diff --git a/main/como/cluster_rnaseq.py b/main/como/cluster_rnaseq.py
index 214910d0..6aaed80f 100644
--- a/main/como/cluster_rnaseq.py
+++ b/main/como/cluster_rnaseq.py
@@ -8,7 +8,7 @@
 import numpy as np
 
 from como.data_types import LogLevel
-from como.utils import log_and_raise_error, stringlist_to_list
+from como.utils import stringlist_to_list
 
 
 @dataclass
@@ -35,77 +35,43 @@ def __post_init__(self):  # noqa: C901, ignore too complex
         self.seed = np.random.randint(0, 100_000)
 
         if (isdigit(self.min_active_count) and int(self.min_active_count) < 0) or self.min_active_count != "default":
-            log_and_raise_error(
-                "min_active_count must be either 'default' or an integer > 0",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError("min_active_count must be either 'default' or an integer > 0")
 
         if (isdigit(self.quantile) and 0 > int(self.quantile) > 100) or self.quantile != "default":
-            log_and_raise_error(
-                "quantile must be either 'default' or an integer between 0 and 100",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError("quantile must be either 'default' or an integer between 0 and 100")
 
         if (isdigit(self.replicate_ratio) and 0 > self.replicate_ratio > 1.0) or self.replicate_ratio != "default":
-            log_and_raise_error(
-                "--rep-ratio must be either 'default' or a float between 0 and 1",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError("--rep-ratio must be either 'default' or a float between 0 and 1")
 
         if (isdigit(self.batch_ratio) and 0 > self.batch_ratio > 1.0) or self.batch_ratio != "default":
-            log_and_raise_error(
-                "--batch-ratio must be either 'default' or a float between 0 and 1",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError("--batch-ratio must be either 'default' or a float between 0 and 1")
 
         if self.filtering_technique.lower() not in {"quantile", "tpm", "cpm", "zfpkm"}:
-            log_and_raise_error(
-                "--technique must be either 'quantile', 'tpm', 'cpm', 'zfpkm'",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError("--technique must be either 'quantile', 'tpm', 'cpm', 'zfpkm'")
         if self.filtering_technique.lower() == "tpm":
             self.filtering_technique = "quantile"
 
         if self.cluster_algorithm.lower() not in {"mca", "umap"}:
-            log_and_raise_error(
-                "--clust_algo must be either 'mca', 'umap'",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError("--clust_algo must be either 'mca', 'umap'")
 
         if 0 > self.min_distance > 1.0:
-            log_and_raise_error(
-                "--min_dist must be a float between 0 and 1",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
-
-        if (isdigit(self.num_replicate_neighbors) and self.num_replicate_neighbors < 1) or self.num_replicate_neighbors != "default":
-            log_and_raise_error(
-                "--n-neighbors-rep must be either 'default' or an integer > 1",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
-
-        if (isdigit(self.num_batch_neighbors) and self.num_batch_neighbors < 1) or self.num_batch_neighbors != "default":
-            log_and_raise_error(
-                "--n-neighbors-batch must be either 'default' or an integer > 1",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
-
-        if (isdigit(self.num_context_neighbors) and self.num_context_neighbors < 1) or self.num_context_neighbors != "default":
-            log_and_raise_error(
-                "--n-neighbors-context must be either 'default' or an integer > 1",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError("--min_dist must be a float between 0 and 1")
+
+        if (
+            isdigit(self.num_replicate_neighbors) and self.num_replicate_neighbors < 1
+        ) or self.num_replicate_neighbors != "default":
"default": + raise ValueError("--n-neighbors-rep must be either 'default' or an integer > 1") + + if ( + isdigit(self.num_batch_neighbors) and self.num_batch_neighbors < 1 + ) or self.num_batch_neighbors != "default": + raise ValueError("--n-neighbors-batch must be either 'default' or an integer > 1") + + if ( + isdigit(self.num_context_neighbors) and self.num_context_neighbors < 1 + ) or self.num_context_neighbors != "default": + raise ValueError("--n-neighbors-context must be either 'default' or an integer > 1") def _parse_args() -> _Arguments: diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 4b5406a8..5bb39d24 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -20,6 +20,7 @@ ) from como.pipelines.identifier import convert from como.utils import LogLevel, get_missing_gene_data, log_and_raise_error, num_columns +from como.utils import num_columns def _combine_z_distribution_for_batch( @@ -191,10 +192,9 @@ def _combine_z_distribution_for_context( for res in zscore_results: matrix = res.z_score_matrix.copy() if len(matrix.columns) > 1: - log_and_raise_error( - f"Expected a single column for combined z-score dataframe for data '{res.type.value.lower()}'. Got '{len(matrix.columns)}' columns", - error=ValueError, - level=LogLevel.ERROR, + raise ValueError( + f"Expected a single column for combined z-score dataframe for data '{res.type.value.lower()}'. " + f"Got '{len(matrix.columns)}' columns" ) matrix.columns = [res.type.value.lower()] @@ -327,10 +327,9 @@ async def _begin_combining_distributions( else "" ) if not index_name: - log_and_raise_error( - f"Unable to find common gene identifier across batches for source '{source.value}' in context '{context_name}'", - error=ValueError, - level=LogLevel.ERROR, + raise ValueError( + f"Unable to find common gene identifier across batches for source " + f"'{source.value}' in context '{context_name}'" ) merged_batch_results = pd.concat(batch_results, axis="columns") merged_batch_results.index.name = index_name diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index a0410e48..518bf6b3 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -34,7 +34,7 @@ Solver, _BoundaryReactions, ) -from como.utils import log_and_raise_error, set_up_logging, split_gene_expression_data +from como.utils import set_up_logging, split_gene_expression_data def _reaction_indices_to_ids( @@ -235,10 +235,8 @@ def _build_with_fastcore( ) s_matrix = cast(npt.NDArray[np.floating], cobra.util.create_stoichiometric_matrix(model=model)) if lower_bounds.shape[0] != upper_bounds.shape[0] != s_matrix.shape[1]: - log_and_raise_error( - message="Lower bounds, upper bounds, and stoichiometric matrix must have the same number of reactions.", - error=ValueError, - level=LogLevel.ERROR, + raise ValueError( + "Lower bounds, upper bounds, and stoichiometric matrix must have the same number of reactions." 
         )
     logger.debug("Creating feasible model")
     _, cobra_model = _feasibility_test(cobra_model, "other")
@@ -301,7 +299,7 @@ def _build_with_tinit(
     solver,
     idx_force,
 ) -> Model:
-    log_and_raise_error("tINIT is not yet implemented.", error=NotImplementedError, level=LogLevel.CRITICAL)
+    raise NotImplementedError("tINIT is not yet implemented.")
     model = reference_model
     properties = tINITProperties(
         reactions_scores=expr_vector,
@@ -331,7 +329,7 @@ def _build_with_corda(
     :param neg_expression_threshold: Reactions expressed below this value will be placed in "negative" expression bin
     :param high_expression_threshold: Reactions expressed above this value will be placed in the "high" expression bin
     """
-    log_and_raise_error("CORDA is not yet implemented", error=NotImplementedError, level=LogLevel.CRITICAL)
+    raise NotImplementedError("CORDA is not yet implemented")
     model = reference_model
     properties = CORDAProperties(
         high_conf_rx=[],
@@ -450,12 +448,8 @@ def _read_reference_model(filepath: Path) -> cobra.Model:
         case ".json":
            reference_model = cobra.io.load_json_model(filepath)
         case _:
-            log_and_raise_error(
-                f"Reference model format must be .xml, .mat, or .json; found '{filepath.suffix}'",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError(f"Reference model format must be .xml, .mat, or .json; found '{filepath.suffix}'")
     return reference_model
 
 
 async def _build_model(
@@ -523,28 +516,16 @@ async def _build_model(
         ref_ub[i] = float(rxn.upper_bound)
         reaction_ids.append(rxn.id)
     if ref_lb.shape[0] != ref_ub.shape[0] != len(reaction_ids):
-        log_and_raise_error(
-            message=(
-                "Lower bounds, upper bounds, and reaction IDs must have the same length.\n"
-                f"Number of reactions: {len(reaction_ids)}\n"
-                f"Number of upper bounds: {ref_ub.shape[0]}\n"
-                f"Number of lower bounds: {ref_lb.shape[0]}"
-            ),
-            error=ValueError,
-            level=LogLevel.ERROR,
+        raise ValueError(
+            "Lower bounds, upper bounds, and reaction IDs must have the same length.\n"
+            f"Number of reactions: {len(reaction_ids)}\n"
+            f"Number of upper bounds: {ref_ub.shape[0]}\n"
+            f"Number of lower bounds: {ref_lb.shape[0]}"
         )
     if np.isnan(ref_lb).any():
-        log_and_raise_error(
-            message="Lower bounds contains unfilled values!",
-            error=ValueError,
-            level=LogLevel.ERROR,
-        )
+        raise ValueError("Lower bounds contains unfilled values!")
     if np.isnan(ref_ub).any():
-        log_and_raise_error(
-            message="Upper bounds contains unfilled values!",
-            error=ValueError,
-            level=LogLevel.ERROR,
-        )
+        raise ValueError("Upper bounds contains unfilled values!")
 
     # get expressed reactions
     reaction_expression: collections.OrderedDict[str, int] = await _map_expression_to_reaction(
@@ -635,14 +616,10 @@ async def _build_model(
             idx_force=force_reaction_indices,
         )
     else:
-        log_and_raise_error(
-            (
-                f"Reconstruction algorithm must be {Algorithm.GIMME.value}, "
-                f"{Algorithm.FASTCORE.value}, {Algorithm.IMAT.value}, or {Algorithm.TINIT.value}. "
-                f"Got: {recon_algorithm.value}"
-            ),
-            error=ValueError,
-            level=LogLevel.ERROR,
+        raise ValueError(
+            f"Reconstruction algorithm must be {Algorithm.GIMME.value}, "
+            f"{Algorithm.FASTCORE.value}, {Algorithm.IMAT.value}, or {Algorithm.TINIT.value}. "
" + f"Got: {recon_algorithm.value}" ) inconsistent_and_infeasible_reactions: pd.DataFrame = pd.concat( @@ -690,13 +667,9 @@ async def _collect_boundary_reactions(path: Path) -> _BoundaryReactions: "minimum reaction rate", "maximum reaction rate", ]: - log_and_raise_error( - ( - f"Boundary reactions file must have columns named 'Reaction', 'Abbreviation', 'Compartment', " - f"'Minimum Reaction Rate', and 'Maximum Reaction Rate'. Found: {column}" - ), - error=ValueError, - level=LogLevel.ERROR, + raise ValueError( + f"Boundary reactions file must have columns named 'Reaction', 'Abbreviation', 'Compartment', " + f"'Minimum Reaction Rate', and 'Maximum Reaction Rate'. Found: {column}" ) reactions: list[str] = [""] * len(df) @@ -707,11 +680,7 @@ async def _collect_boundary_reactions(path: Path) -> _BoundaryReactions: for i in range(len(boundary_type)): boundary: str = boundary_type[i].lower() if boundary not in boundary_map: - log_and_raise_error( - f"Boundary reaction type must be 'Exchange', 'Demand', or 'Sink'. Found: {boundary}", - error=ValueError, - level=LogLevel.ERROR, - ) + raise ValueError(f"Boundary reaction type must be 'Exchange', 'Demand', or 'Sink'. Found: {boundary}") shorthand_compartment = CobraCompartments.get_shorthand(reaction_compartment[i]) reactions[i] = f"{boundary_map.get(boundary)}_{reaction_abbreviation[i]}[{shorthand_compartment}]" @@ -741,10 +710,8 @@ async def _write_model_to_disk( elif path.suffix in xml_suffix: tasks.add(asyncio.to_thread(cobra.io.write_sbml_model, model=model, filename=path)) else: - log_and_raise_error( - f"Invalid output model filetype. Should be one of .xml, .sbml, .mat, or .json. Got '{path.suffix}'", - error=ValueError, - level=LogLevel.ERROR, + raise ValueError( + f"Invalid output model filetype. Should be one of .xml, .sbml, .mat, or .json. Got '{path.suffix}'" ) logger.success(f"Will save metabolic model for context '{context_name}' to: '{path}'") await asyncio.gather(*tasks) @@ -809,43 +776,19 @@ async def create_context_specific_model( # noqa: C901 output_model_filepaths = [output_model_filepaths] if isinstance(output_model_filepaths, Path) else output_model_filepaths if not reference_model.exists(): - log_and_raise_error( - f"Reference model not found at {reference_model}", - error=FileNotFoundError, - level=LogLevel.ERROR, - ) + raise FileNotFoundError(f"Reference model not found at {reference_model}") if not active_genes_filepath.exists(): - log_and_raise_error( - f"Active genes file not found at {active_genes_filepath}", - error=FileNotFoundError, - level=LogLevel.ERROR, - ) + raise FileNotFoundError(f"Active genes file not found at {active_genes_filepath}") if algorithm == Algorithm.FASTCORE and not output_fastcore_expression_index_filepath: - log_and_raise_error( - "The fastcore expression index output filepath must be provided", - error=ValueError, - level=LogLevel.ERROR, - ) + raise ValueError("The fastcore expression index output filepath must be provided") if boundary_rxns_filepath and not boundary_rxns_filepath.exists(): - log_and_raise_error( - f"Boundary reactions file not found at {boundary_rxns_filepath}", - error=FileNotFoundError, - level=LogLevel.ERROR, - ) + raise FileNotFoundError(f"Boundary reactions file not found at {boundary_rxns_filepath}") if algorithm not in Algorithm: - log_and_raise_error( - f"Algorithm {algorithm} not supported. Use one of {', '.join(a.value for a in Algorithm)}", - error=ValueError, - level=LogLevel.ERROR, - ) + raise ValueError(f"Algorithm {algorithm} not supported. 
     if solver not in Solver:
-        log_and_raise_error(
-            f"Solver '{solver}' not supported. Use one of {', '.join(s.value for s in Solver)}",
-            error=ValueError,
-            level=LogLevel.ERROR,
-        )
+        raise ValueError(f"Solver '{solver}' not supported. Use one of {', '.join(s.value for s in Solver)}")
 
     mat_suffix, json_suffix, xml_suffix = {".mat"}, {".json"}, {".sbml", ".xml"}
     if any(path.suffix not in {*mat_suffix, *json_suffix, *xml_suffix} for path in output_model_filepaths):
@@ -858,7 +801,3 @@ async def create_context_specific_model(  # noqa: C901
-        log_and_raise_error(
-            f"Invalid output filetype. Should be 'xml', 'sbml', 'mat', or 'json'. Got:\n{invalid_suffix}'",
-            error=ValueError,
-            level=LogLevel.ERROR,
-        )
+        raise ValueError(f"Invalid output filetype. Should be 'xml', 'sbml', 'mat', or 'json'. Got:\n{invalid_suffix}'")
 
     boundary_reactions = None
@@ -869,11 +813,7 @@ async def create_context_specific_model(  # noqa: C901
         exclude_rxns_filepath: Path = Path(exclude_rxns_filepath)
         df = await _create_df(exclude_rxns_filepath)
         if "abbreviation" not in df.columns:
-            log_and_raise_error(
-                "The exclude reactions file should have a single column with a header named Abbreviation",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError("The exclude reactions file should have a single column with a header named Abbreviation")
         exclude_rxns = df["abbreviation"].tolist()
 
     force_rxns: list[str] = []
@@ -881,11 +821,7 @@ async def create_context_specific_model(  # noqa: C901
         force_rxns_filepath: Path = Path(force_rxns_filepath)
         df = await _create_df(force_rxns_filepath, lowercase_col_names=True)
         if "abbreviation" not in df.columns:
-            log_and_raise_error(
-                "The force reactions file should have a single column with a header named Abbreviation",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError("The force reactions file should have a single column with a header named Abbreviation")
         force_rxns = df["abbreviation"].tolist()
 
     # Test that gurobi is using a valid license file
@@ -894,14 +830,10 @@ async def create_context_specific_model(  # noqa: C901
         gurobi_present = find_spec("gurobipy")
         if not gurobi_present:
-            log_and_raise_error(
-                message=(
-                    "The gurobi solver requires the gurobipy package to be installed. "
-                    "Please install gurobipy and try again. "
-                    "This can be done by installing the 'gurobi' optional dependency."
-                ),
-                error=ImportError,
-                level=LogLevel.ERROR,
+            raise ImportError(
+                "The gurobi solver requires the gurobipy package to be installed. "
+                "Please install gurobipy and try again. "
+                "This can be done by installing the 'gurobi' optional dependency."
             )
 
         if not Path(f"{os.environ['HOME']}/gurobi.lic").exists():
diff --git a/main/como/data_types.py b/main/como/data_types.py
index a004f862..ca0f9b8c 100644
--- a/main/como/data_types.py
+++ b/main/como/data_types.py
@@ -226,10 +226,7 @@ def __setitem__(self, key, value):
 
     def _validate_attribute(self, key):
         if key not in {i.value for i in SourceTypes._member_map_.values()}:
-            # Unable to use como.utils._log_and_raise_error because it results in a circular import
-            message = f"{key} is not a valid attribute of {SourceTypes.__name__}; got '{key}'"
-            logger.warning(message)
-            raise ValueError(message)
+            raise ValueError(f"{key} is not a valid attribute of {SourceTypes.__name__}; got '{key}'")
 
     def __iter__(self) -> Iterator[tuple[SourceTypes, pd.DataFrame | None]]:
         """Iterate over matrix fields and their names.
diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py
index 96d6a3a9..2cfdd1a1 100644
--- a/main/como/merge_xomics.py
+++ b/main/como/merge_xomics.py
@@ -70,11 +70,7 @@ def load_dummy_dict():
     inquiry_full_path = Path(config.data_dir, "config_sheets", filename)
     if not inquiry_full_path.exists():
-        log_and_raise_error(
-            f"Config file not found at {inquiry_full_path}",
-            error=FileNotFoundError,
-            level=LogLevel.ERROR,
-        )
+        raise FileNotFoundError(f"Config file not found at {inquiry_full_path}")
 
     filename: str = f"{prep_method.value}_{context_name}.csv"
     save_filepath = config.result_dir / context_name / prep_method.value / filename
@@ -448,11 +444,7 @@ async def _process(
     elif adjust_method == AdjustmentMethod.FLAT:
         adjusted_expression_requirement = expression_requirement
     else:
-        log_and_raise_error(
-            message=f"Unknown `adjust_method`: {adjust_method}.",
-            error=ValueError,
-            level=LogLevel.ERROR,
-        )
+        raise ValueError(f"Unknown `adjust_method`: {adjust_method}.")
 
     logger.debug(f"Adjusted expression requirement: {adjusted_expression_requirement}")
     if adjusted_expression_requirement != expression_requirement:
@@ -513,10 +505,8 @@ def _build_batches(
     for study in sorted(metadata["study"].unique()):
         batch_search = re.search(r"\d+", study)
         if not batch_search:
-            log_and_raise_error(
-                message=f"Unable to find batch number in study name. Expected a digit in the study value: {study}",
-                error=ValueError,
-                level=LogLevel.ERROR,
+            raise ValueError(
+                f"Unable to find batch number in study name. Expected a digit in the study value: {study}"
             )
 
         batch_num = int(batch_search.group(0))  # ty: ignore[possibly-missing-attribute]
@@ -542,11 +532,7 @@ def _validate_source_arguments(
     """
     if any(i for i in args) and not all(i for i in args):
-        log_and_raise_error(
-            f"Must specify all or none of '{source.value}' arguments",
-            error=ValueError,
-            level=LogLevel.ERROR,
-        )
+        raise ValueError(f"Must specify all or none of '{source.value}' arguments")
 
 
 async def merge_xomics(  # noqa: C901
@@ -611,7 +597,7 @@ async def merge_xomics(  # noqa: C901
             proteomic_matrix_or_filepath,
         )
     ):
-        log_and_raise_error("No data was passed!", error=ValueError, level=LogLevel.ERROR)
+        raise ValueError("No data was passed!")
 
     if expression_requirement and expression_requirement < 1:
         logger.warning(f"Expression requirement must be at least 1! Setting to the minimum of 1 now. Got: {expression_requirement}")
Got: {expression_requirement}") diff --git a/main/como/proteomics/FTPManager.py b/main/como/proteomics/FTPManager.py index 86b27a07..8cca5095 100644 --- a/main/como/proteomics/FTPManager.py +++ b/main/como/proteomics/FTPManager.py @@ -17,7 +17,6 @@ from loguru import logger from como.proteomics.FileInformation import FileInformation, clear_print -from como.utils import log_and_raise_error from como.data_types import LogLevel @@ -43,11 +42,7 @@ async def aioftp_client(host: str, username: str = "anonymous", password: str = attempt_num += 1 time.sleep(5) if not connection_successful: - log_and_raise_error( - "Could not connect to FTP server", - error=ConnectionResetError, - level=LogLevel.ERROR, - ) + raise ConnectionResetError("Could not connect to FTP server") return client @@ -97,19 +92,11 @@ async def _get_info(self) -> None: if url_parse.hostname is not None: host = url_parse.hostname else: - log_and_raise_error( - f"Unable to identify hostname from url: {self._root_link}", - error=ValueError, - level=LogLevel.ERROR, - ) + raise ValueError(f"Unable to identify hostname from url: {self._root_link}") if url_parse.path != "": folder = url_parse.path else: - log_and_raise_error( - f"Unable to identify folder or path from url: {self._root_link}", - error=ValueError, - level=LogLevel.ERROR, - ) + raise ValueError(f"Unable to identify folder or path from url: {self._root_link}") client = await aioftp_client(host=host) for path, info in await client.list(folder, recursive=True): @@ -184,19 +171,11 @@ async def _aioftp_download_data(self, file_information: FileInformation, semapho if url_parse.hostname is not None: host = url_parse.hostname else: - log_and_raise_error( - f"Unable to identify hostname from url: {file_information.download_url}", - error=ValueError, - level=LogLevel.ERROR, - ) + raise ValueError(f"Unable to identify hostname from url: {file_information.download_url}") if url_parse.path != "": folder = url_parse.path else: - log_and_raise_error( - f"Unable to identify folder or path from url: {file_information.download_url}", - error=ValueError, - level=LogLevel.ERROR, - ) + raise ValueError(f"Unable to identify folder or path from url: {file_information.download_url}") # Convert file size from byte to MB size_mb: int = round(file_information.file_size / (1024**2)) diff --git a/main/como/proteomics/proteomics_preprocess.py b/main/como/proteomics/proteomics_preprocess.py index b3740ac9..04242443 100644 --- a/main/como/proteomics/proteomics_preprocess.py +++ b/main/como/proteomics/proteomics_preprocess.py @@ -9,7 +9,6 @@ from como.data_types import LogLevel from como.proteomics import Crux, FileInformation, FTPManager -from como.utils import log_and_raise_error class ArgParseFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter): @@ -316,15 +315,13 @@ def parse_args() -> argparse.Namespace: # Validte the input file exists if not Path(args.input_csv).is_file(): - log_and_raise_error(f"Input file {args.input} does not exist!", error=FileNotFoundError, level=LogLevel.ERROR) + raise FileNotFoundError(f"Input file {args.input} does not exist!") if args.core_count == "all": args.core_count = os.cpu_count() elif not str(args.core_count).isdigit(): - log_and_raise_error( - f"Invalid option '{args.core_count}' for option '--cores'. Enter an integer or 'all' to use all cores", - error=ValueError, - level=LogLevel.ERROR, + raise ValueError( + f"Invalid option '{args.core_count}' for option '--cores'. 
         )
     else:
diff --git a/main/como/proteomics_gen.py b/main/como/proteomics_gen.py
index a2714e69..cbb4531f 100644
--- a/main/como/proteomics_gen.py
+++ b/main/como/proteomics_gen.py
@@ -14,7 +14,7 @@
 from como.data_types import LogLevel
 from como.project import Config
 from como.proteomics_preprocessing import protein_transform_main
-from como.utils import log_and_raise_error, set_up_logging, return_placeholder_data
+from como.utils import return_placeholder_data, set_up_logging
 
 
 # Load Proteomics
@@ -31,17 +31,9 @@ def process_proteomics_data(path: Path) -> pd.DataFrame:
     matrix: pd.DataFrame = pd.read_csv(path)
     gene_symbol_colname = [col for col in matrix.columns if "symbol" in col]
     if len(gene_symbol_colname) == 0:
-        log_and_raise_error(
-            "No gene_symbol column found in proteomics data.",
-            error=ValueError,
-            level=LogLevel.ERROR,
-        )
+        raise ValueError("No gene_symbol column found in proteomics data.")
     if len(gene_symbol_colname) > 1:
-        log_and_raise_error(
-            "Multiple gene_symbol columns found in proteomics data.",
-            error=ValueError,
-            level=LogLevel.ERROR,
-        )
+        raise ValueError("Multiple gene_symbol columns found in proteomics data.")
     symbol_col = gene_symbol_colname[0]
     matrix = matrix.rename(columns={symbol_col: "gene_symbol"})
     matrix["gene_symbol"] = matrix["gene_symbol"].astype(str)
@@ -182,7 +174,7 @@ def load_empty_dict():
     inquiry_full_path = config.data_dir / "config_sheets" / filename
     if not inquiry_full_path.exists():
-        log_and_raise_error(f"Error: file not found {inquiry_full_path}", error=FileNotFoundError, level=LogLevel.ERROR)
+        raise FileNotFoundError(f"Error: file not found {inquiry_full_path}")
 
     filename = f"Proteomics_{context_name}.csv"
     full_save_filepath = config.result_dir / context_name / "proteomics" / filename
@@ -217,37 +209,17 @@ async def proteomics_gen(
     set_up_logging(level=log_level, location=log_location)
 
     if not config_filepath.exists():
-        log_and_raise_error(
-            f"Config file not found at {config_filepath}",
-            error=FileNotFoundError,
-            level=LogLevel.ERROR,
-        )
+        raise FileNotFoundError(f"Config file not found at {config_filepath}")
     if config_filepath.suffix not in (".xlsx", ".xls"):
-        log_and_raise_error(
-            f"Config file must be an xlsx or xls file at {config_filepath}",
-            error=FileNotFoundError,
-            level=LogLevel.ERROR,
-        )
+        raise ValueError(f"Config file must be an xlsx or xls file at {config_filepath}")
     if not matrix_filepath.exists():
-        log_and_raise_error(
-            f"Matrix file not found at {matrix_filepath}",
-            error=FileNotFoundError,
-            level=LogLevel.ERROR,
-        )
+        raise FileNotFoundError(f"Matrix file not found at {matrix_filepath}")
     if matrix_filepath.suffix != ".csv":
-        log_and_raise_error(
-            f"Matrix file must be a csv file at {matrix_filepath}",
-            error=FileNotFoundError,
-            level=LogLevel.ERROR,
-        )
+        raise ValueError(f"Matrix file must be a csv file at {matrix_filepath}")
     if quantile < 0 or quantile > 100:
-        log_and_raise_error(
-            "Quantile must be an integer from 0 to 100",
-            error=ValueError,
-            level=LogLevel.ERROR,
-        )
+        raise ValueError("Quantile must be an integer from 0 to 100")
 
     quantile /= 100
     config_df = pd.read_excel(config_filepath, sheet_name=context_name)
diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
index ba1d3df7..3f8c00db 100644
--- a/main/como/rnaseq_gen.py
+++ b/main/como/rnaseq_gen.py
@@ -26,7 +26,7 @@
 from como.migrations import gene_info_migrations
 from como.pipelines.identifier import convert
 from como.project import Config
-from como.utils import log_and_raise_error, read_file, set_up_logging
+from como.utils import read_file, set_up_logging
 
 
 class _FilteringOptions(NamedTuple):
@@ -61,11 +61,7 @@ class _StudyMetrics:
     def __post_init__(self):
         for layout in self.layout:
             if layout not in LayoutMethod:
-                log_and_raise_error(
-                    f"Layout must be 'paired-end' or 'single-end'; got: {layout}",
-                    error=ValueError,
-                    level=LogLevel.ERROR,
-                )
+                raise ValueError(f"Layout must be 'paired-end' or 'single-end'; got: {layout}")
 
     @property
     def normalization_matrix(self) -> pd.DataFrame:
@@ -139,11 +135,7 @@ def genefilter(data: pd.DataFrame | npt.NDArray, filter_func: Callable[[npt.NDAr
         A NumPy array of the filtered data.
     """
     if not isinstance(data, pd.DataFrame | np.ndarray):
-        log_and_raise_error(
-            f"Unsupported data type. Must be a Pandas DataFrame or a NumPy array, got '{type(data)}'",
-            error=TypeError,
-            level=LogLevel.CRITICAL,
-        )
+        raise TypeError(f"Unsupported data type. Must be a Pandas DataFrame or a NumPy array, got '{type(data)}'")
 
     return (
         data.apply(filter_func, axis=1).to_numpy()
@@ -270,11 +262,7 @@ async def _build_matrix_results(
         entrez_gene_ids = subset.var["entrez_gene_id"].to_numpy(dtype=int)
         gene_sizes = subset.var["size"].to_numpy(dtype=int)
     else:
-        log_and_raise_error(
-            message=f"Matrix must be a pandas DataFrame or scanpy AnnData object, got: '{type(matrix)}'.",
-            error=TypeError,
-            level=LogLevel.CRITICAL,
-        )
+        raise TypeError(f"Matrix must be a pandas DataFrame or scanpy AnnData object, got: '{type(matrix)}'.")
 
     frag_lengths = None
     if fragment_df is not None:
@@ -338,11 +326,7 @@ def _calculate_fpkm(metrics: NamedMetrics, scale: float = 1e6) -> NamedMetrics:
         matrix_values: dict[str, npt.NDArray[np.floating]] = {}
         count_matrix = metrics[study].count_matrix
         if not isinstance(count_matrix, pd.DataFrame):
-            log_and_raise_error(
-                message="FPKM cannot be performed on scanpy.AnnData objects!",
-                error=TypeError,
-                level=LogLevel.CRITICAL,
-            )
+            raise TypeError("FPKM cannot be performed on scanpy.AnnData objects!")
 
         study_counts = count_matrix.to_numpy(dtype=int, copy=False)
         for i in range(metrics[study].num_samples):
@@ -712,11 +696,7 @@ def filter_counts(
                 perform_normalization=umi_perform_normalization,
             )
         case _:
-            log_and_raise_error(
-                f"Technique must be one of {FilteringTechnique}, got '{technique.value}'",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError(f"Technique must be one of {FilteringTechnique}, got '{technique.value}'")
 
 
 async def _process(
@@ -911,19 +891,11 @@ async def rnaseq_gen(  # noqa: C901
         case FilteringTechnique.TPM:
             cutoff: int | float = cutoff or 25
             if cutoff < 1 or cutoff > 100:
-                log_and_raise_error(
-                    "Quantile must be between 1 - 100",
-                    error=ValueError,
-                    level=LogLevel.ERROR,
-                )
+                raise ValueError("Quantile must be between 1 - 100")
         case FilteringTechnique.CPM:
             if cutoff and cutoff < 0:
-                log_and_raise_error(
-                    "Cutoff must be greater than or equal to 0",
-                    error=ValueError,
-                    level=LogLevel.ERROR,
-                )
+                raise ValueError("Cutoff must be greater than or equal to 0")
             elif cutoff:
                 cutoff = "default"
@@ -932,18 +904,10 @@ async def rnaseq_gen(  # noqa: C901
         case FilteringTechnique.UMI:
             cutoff: int = cutoff or 1
         case _:
-            log_and_raise_error(
-                f"Technique must be one of {','.join(FilteringTechnique)}. Got: {technique.value}",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError(f"Technique must be one of {','.join(FilteringTechnique)}. Got: {technique.value}")
 
     if not input_rnaseq_filepath.exists():
-        log_and_raise_error(
-            f"Input RNA-seq file not found! Searching for: '{input_rnaseq_filepath}'",
Searching for: '{input_rnaseq_filepath}'", - error=FileNotFoundError, - level=LogLevel.ERROR, - ) + raise FileNotFoundError(f"Input RNA-seq file not found! Searching for: '{input_rnaseq_filepath}'") if prep == RNAType.SCRNA and technique.value.lower() != FilteringTechnique.UMI.value.lower(): logger.warning( @@ -957,24 +921,17 @@ async def rnaseq_gen( # noqa: C901 metadata_df = input_metadata_filepath_or_df elif isinstance(input_metadata_filepath_or_df, Path): if input_metadata_filepath_or_df.suffix not in {".xlsx", ".xls"}: - log_and_raise_error( - f"Expected an excel file with extension of '.xlsx' or '.xls', got '{input_metadata_filepath_or_df.suffix}'.", - error=ValueError, - level=LogLevel.ERROR, + raise ValueError( + f"Expected an excel file with extension of '.xlsx' or '.xls', " + f"got '{input_metadata_filepath_or_df.suffix}'" ) if not input_metadata_filepath_or_df.exists(): - log_and_raise_error( - f"Input metadata file not found! Searching for: '{input_metadata_filepath_or_df}'", - error=FileNotFoundError, - level=LogLevel.ERROR, - ) + raise FileNotFoundError(f"Input metadata file not found! Searching for: '{input_metadata_filepath_or_df}'") metadata_df = pd.read_excel(input_metadata_filepath_or_df) else: - log_and_raise_error( - f"Expected a pandas DataFrame or Path object as metadata, got '{type(input_metadata_filepath_or_df)}'", - error=TypeError, - level=LogLevel.ERROR, + raise TypeError( + f"Expected a pandas DataFrame or Path object as metadata, got '{type(input_metadata_filepath_or_df)}'" ) logger.debug(f"Starting '{context_name}'") diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 6faf4862..24d5e8b6 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -19,7 +19,7 @@ from como.data_types import LogLevel, RNAType from como.pipelines.identifier import convert -from como.utils import log_and_raise_error, read_file, set_up_logging +from como.utils import read_file, set_up_logging @dataclass @@ -32,17 +32,9 @@ class _QuantInformation: @classmethod def build_from_sf(cls, filepath: Path) -> _QuantInformation: if filepath.suffix != ".sf": - log_and_raise_error( - f"Building quantification information requires a '.sf' file; received: '{filepath}'", - error=ValueError, - level=LogLevel.ERROR, - ) + raise ValueError(f"Building quantification information requires a '.sf' file; received: '{filepath}'") if not filepath.exists(): - log_and_raise_error( - f"Unable to find the .sf file: {filepath}", - error=FileNotFoundError, - level=LogLevel.ERROR, - ) + raise FileNotFoundError(f"Unable to find the .sf file: {filepath}") sample_name = filepath.stem.removesuffix("_quant.genes") df = read_file( @@ -80,41 +72,25 @@ def __post_init__(self): self.__sample_names = [f.stem for f in self.quant_files] if len(self.quant_files) != len(self.strand_files): - log_and_raise_error( - ( - f"Unequal number of count files and strand files for study '{self.study_name}'. " - f"Found {len(self.quant_files)} count files and {len(self.strand_files)} strand files." - ), - error=ValueError, - level=LogLevel.ERROR, + raise ValueError( + f"Unequal number of count files and strand files for study '{self.study_name}'. " + f"Found {len(self.quant_files)} count files and {len(self.strand_files)} strand files." ) if self.num_samples != len(self.quant_files): - log_and_raise_error( - ( - f"Unequal number of samples and count files for study '{self.study_name}'. " - f"Found {self.num_samples} samples and {len(self.quant_files)} count files." 
-                ),
-                error=ValueError,
-                level=LogLevel.ERROR,
+            raise ValueError(
+                f"Unequal number of samples and count files for study '{self.study_name}'. "
+                f"Found {self.num_samples} samples and {len(self.quant_files)} count files."
             )
 
         if self.num_samples != len(self.strand_files):
-            log_and_raise_error(
-                (
-                    f"Unequal number of samples and strand files for study '{self.study_name}'. "
-                    f"Found {self.num_samples} samples and {len(self.strand_files)} strand files."
-                ),
-                error=ValueError,
-                level=LogLevel.ERROR,
+            raise ValueError(
+                f"Unequal number of samples and strand files for study '{self.study_name}'. "
+                f"Found {self.num_samples} samples and {len(self.strand_files)} strand files."
             )
 
         if self.__num_samples == 1:
-            log_and_raise_error(
-                f"Only one sample exists for study {self.study_name}. Provide at least two samples",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError(f"Only one sample exists for study {self.study_name}. Provide at least two samples")
 
         self.quant_files.sort()
         self.strand_files.sort()
@@ -134,26 +110,16 @@ class SampleConfiguration:
     def __post_init__(self):
         """Validate the effective lengths dataframe to ensure it has the expected structure and content."""
         if len(self.effective_lengths.columns) > 2:
-            log_and_raise_error(
-                message=(
-                    f"Effective lengths dataframe for sample '{self.sample_name}' has more than 2 columns, "
-                    f"expected 'name' and 'effective_length'"
-                ),
-                error=ValueError,
-                level=LogLevel.ERROR,
+            raise ValueError(
+                f"Effective lengths dataframe for sample '{self.sample_name}' has more than 2 columns, "
+                f"expected 'name' and 'effective_length'"
             )
+
         if "name" not in self.effective_lengths.columns:
-            log_and_raise_error(
-                message=f"Effective lengths dataframe for sample '{self.sample_name}' is missing 'name' column",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError(f"Effective lengths dataframe for sample '{self.sample_name}' is missing 'name' column")
+
         if "effective_length" not in self.effective_lengths.columns:
-            log_and_raise_error(
-                message=f"Sample '{self.sample_name}' is missing 'effective_length' column",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError(f"Sample '{self.sample_name}' is missing 'effective_length' column")
 
     @classmethod
     def to_dataframe(cls, samples: list[SampleConfiguration]) -> tuple[pd.DataFrame, pd.DataFrame]:
@@ -186,21 +152,13 @@ def to_dataframe(cls, samples: list[SampleConfiguration]) -> tuple[pd.DataFrame,
 def _sample_name_from_filepath(file: Path) -> str:
     group = re.search(r".+_S\d+R\d+(r\d+)?", file.stem)
     if not group:
-        log_and_raise_error(
-            message=(
-                "Filename does not match expected pattern 'contextName_SXRYrZ' where "
-                "X is the study number, Y is the replicate number, and Z is the optional run number"
-            ),
-            error=ValueError,
-            level=LogLevel.ERROR,
+        raise ValueError(
+            "Filename does not match expected pattern 'contextName_SXRYrZ' where "
+            "X is the study number, Y is the replicate number, and Z is the optional run number"
        )
     return group.group()
 
 
-def _sample_name_from_filepath(file: Path) -> str:
-    return re.search(r".+_S\d+R\d+(r\d+)?", file.stem).group()
-
-
 def _require_one(
     paths: list[Path | None],
     kind: Literal["layout", "strand", "preparation", "fragment"],
@@ -218,7 +176,7 @@ def _require_one(
     else:
         message = f"No {kind} file found for {label}, make sure there is one copy for each replicate in COMO_input"
 
-    log_and_raise_error(message=message, error=ValueError, level=LogLevel.ERROR)
+    raise ValueError(message)
 
 
 def _organize_gene_counts_files(data_dir: Path) -> list[_StudyMetrics]:
@@ -235,15 +193,11 @@ def _organize_gene_counts_files(data_dir: Path) -> list[_StudyMetrics]:
     strandedness_directories: list[Path] = sorted([p for p in strand_dir.glob("*") if not p.name.startswith(".")])
 
     if len(quantification_directories) != len(strandedness_directories):
-        log_and_raise_error(
-            (
-                f"Unequal number of quantification directories and strandedness directories. "
-                f"Found {len(quantification_directories)} quantification directories and "
-                f"{len(strandedness_directories)} strandedness directories."
-                f"\nQuantification directory: {quant_dir}\nStrandedness directory: {strand_dir}"
-            ),
-            error=ValueError,
-            level=LogLevel.ERROR,
+        raise ValueError(
+            f"Unequal number of quantification directories and strandedness directories. "
+            f"Found {len(quantification_directories)} quantification directories and "
+            f"{len(strandedness_directories)} strandedness directories."
+            f"\nQuantification directory: {quant_dir}\nStrandedness directory: {strand_dir}"
         )
 
     # For each study, collect gene count files, fragment files, insert size files, layouts, and strandedness information
@@ -252,13 +206,9 @@ def _organize_gene_counts_files(data_dir: Path) -> list[_StudyMetrics]:
         quant_files = list(quant.glob("*_quant.genes.sf"))
         strand_files = list(strand_dir.glob("*.txt"))
         if len(quant_files) == 0:
-            log_and_raise_error(f"No quant found for study '{quant.stem}'.", error=ValueError, level=LogLevel.ERROR)
+            raise ValueError(f"No quant found for study '{quant.stem}'.")
         if len(strand_files) == 0:
-            log_and_raise_error(
-                f"No strandedness files found for study '{quant.stem}'.",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError(f"No strandedness files found for study '{quant.stem}'.")
 
         study_metrics.append(
             _StudyMetrics(
@@ -425,11 +375,7 @@ def _create_config_df(  # noqa: C901
     quant_files: list[Path] = list((como_context_dir / quantification_dir).rglob("*.genes.sf"))
     # gene_counts: list[Path] = list((como_context_dir / gene_count_dirname).rglob("*.tab"))
     if not quant_files:
-        log_and_raise_error(
-            f"No gene count files found in '{gene_count_dirname}'",
-            error=FileNotFoundError,
-            level=LogLevel.ERROR,
-        )
+        raise FileNotFoundError(f"No gene count files found in '{gene_count_dirname}'")
 
     auxillary_directories = {
         "layout": como_context_dir / layout_dirname,
@@ -453,11 +399,8 @@ def _create_config_df(  # noqa: C901
     for quant_file in sorted(quant_files):
         m = label_regex.search(quant_file.as_posix())
         if m is None:
-            log_and_raise_error(
-                f"Filename '{quant_file.name}' does not match contextName_SXRYrZ.tab pattern",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError(f"Filename '{quant_file.name}' does not match contextName_SXRYrZ.tab pattern")
+
         label = m.group()
         study_number = m["study"]
         rep_number = m["rep"]
@@ -471,17 +414,9 @@ def _create_config_df(  # noqa: C901
         strand = strand_path.read_text().rstrip()
         prep = prep_path.read_text().rstrip()
         if prep not in {"total", "mrna"}:
-            log_and_raise_error(
-                f"Prep method must be 'total' or 'mrna' (got '{prep}') for {label}",
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+            raise ValueError(f"Prep method must be 'total' or 'mrna' (got '{prep}') for {label}")
         if layout == "":
-            log_and_raise_error(
-                message=f"No layout file found for '{label}'.",
-                error=FileNotFoundError,
-                level=LogLevel.WARNING,
-            )
+            raise FileNotFoundError(f"No layout file found for '{label}'.")
 
         quant_paths = [p for p in aux_lookup["quantification"].values() if p.name == f"{sample_id}_quant.genes.sf"]
f"{sample_id}_quant.genes.sf"] if ( @@ -540,11 +475,7 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]: try: conversion = convert(ids=data_.var_names.tolist(), taxon=taxon) except json.JSONDecodeError as e: - log_and_raise_error( - f"Got a JSON decode error for file '{counts_matrix_filepaths}' ({e})", - error=ValueError, - level=LogLevel.CRITICAL, - ) + raise ValueError(f"Got a JSON decode error for file '{counts_matrix_filepaths}' ({e})") # Remove NA values from entrez_gene_id dataframe column conversion = conversion[~conversion["ensembl_gene_id"].isna()] @@ -706,17 +637,9 @@ async def _process( # if provided, iterate through como-input specific directories if not create_gene_info_only: if como_context_dir is None: - log_and_raise_error( - message="como_context_dir must be provided if create_gene_info_only is False", - error=ValueError, - level=LogLevel.ERROR, - ) + raise ValueError("como_context_dir must be provided if create_gene_info_only is False") if output_trna_fragment_lengths_filepath is None: - log_and_raise_error( - message="output_fragment_lengths_filepath must be provided if create_gene_info_only is False", - error=ValueError, - level=LogLevel.ERROR, - ) + raise ValueError("output_fragment_lengths_filepath must be provided if create_gene_info_only is False") for rna, out_config, out_matrix, out_frag_len in rna_types: _process_como_input( @@ -789,11 +712,7 @@ async def rnaseq_preprocess( # noqa: C901 # ruff: disable[ASYNC240] if not output_gene_info_filepath: - log_and_raise_error( - message="output_gene_info_filepath must be provided", - error=ValueError, - level=LogLevel.ERROR, - ) + raise ValueError("output_gene_info_filepath must be provided") output_gene_info_filepath = Path(output_gene_info_filepath).resolve() diff --git a/main/como/utils.py b/main/como/utils.py index 996adef1..98daca16 100644 --- a/main/como/utils.py +++ b/main/como/utils.py @@ -140,13 +140,9 @@ def get_missing_gene_data(values: Sequence[str] | pd.DataFrame | sc.AnnData, tax # raise error if duplicate column names exist if any(values.columns.duplicated(keep=False)): duplicate_cols = values.columns[values.columns.duplicated(keep=False)].unique().tolist() - log_and_raise_error( - message=( - f"Duplicate column names exist! This will result in an error processing data. " - f"Duplicates: {','.join(duplicate_cols)}" - ), - error=ValueError, - level=LogLevel.CRITICAL, + raise ValueError( + f"Duplicate column names exist! This will result in an error processing data. " + f"Duplicates: {','.join(duplicate_cols)}" ) names: list[str] = values.columns.tolist() @@ -168,17 +164,11 @@ def get_missing_gene_data(values: Sequence[str] | pd.DataFrame | sc.AnnData, tax taxon_id=taxon_id, ) else: - log_and_raise_error( - message="Unable to find 'gene_symbol', 'entrez_gene_id', or 'ensembl_gene_id' in the input matrix.", - error=ValueError, - level=LogLevel.CRITICAL, + raise ValueError( + "Unable to find 'gene_symbol', 'entrez_gene_id', or 'ensembl_gene_id' in the input matrix." 
         )
     else:
-        log_and_raise_error(
-            message=f"Values must be a list of strings or a pandas DataFrame, got: {type(values)}",
-            error=TypeError,
-            level=LogLevel.CRITICAL,
-        )
+        raise TypeError(f"Values must be a list of strings or a pandas DataFrame, got: {type(values)}")
 
 
 @overload
@@ -239,7 +229,7 @@ def read_file(  # noqa: C901
         return None
 
     if isinstance(path, Path) and not path.exists():
-        log_and_raise_error(f"File {path} does not exist", error=FileNotFoundError, level=LogLevel.CRITICAL)
+        raise FileNotFoundError(f"File not found: '{path}'")
 
     match path.suffix:
         case ".csv" | ".tsv" | ".txt" | ".tab" | ".sf":
@@ -257,11 +247,8 @@ def read_file(  # noqa: C901
                 return df
             return adata
         case _:
-            log_and_raise_error(
-                f"Unknown file extension '{path.suffix}'. "
-                "Valid options are '.tsv', '.csv', '.xlsx', '.xls', or '.h5ad'",
-                error=ValueError,
-                level=LogLevel.CRITICAL,
+            raise ValueError(
+                f"Unknown file extension '{path.suffix}'. Valid options are '.tsv', '.csv', '.xlsx', '.xls', or '.h5ad'"
             )
 
 
@@ -323,26 +310,3 @@ def set_up_logging(
     with contextlib.suppress(ValueError):
         logger.remove(0)
     logger.add(sink=location, level=level.value, format=formatting)
-
-
-def log_and_raise_error(
-    message: str,
-    *,
-    error: type[BaseException],
-    level: LogLevel,
-) -> NoReturn:
-    """Log an error message and raise an exception.
-
-    :param message: The error message to log and include in the raised exception
-    :param error: The type of exception to raise (e.g., ValueError, FileNotFoundError, etc.)
-    :param level: The LogLevel at which to log the error message (e.g., LogLevel.ERROR, LogLevel.CRITICAL)
-    """
-    caller = logger.opt(depth=1)
-    if level == LogLevel.ERROR:
-        caller.error(message)
-        raise error(message)
-    if level == LogLevel.CRITICAL:
-        caller.critical(message)
-        raise error(message)
-
-    raise ValueError(f"When raising an error, LogLevel.ERROR or LogLevel.CRITICAL must be used. Got: {level}")