diff --git a/.gitignore b/.gitignore index 4f1cc30..48cb224 100644 --- a/.gitignore +++ b/.gitignore @@ -40,4 +40,6 @@ nosetests.xml # docs docs/_build/* -docs/api/generated/* \ No newline at end of file +docs/api/generated/* + +tools/* \ No newline at end of file diff --git a/pyprophet/_config.py b/pyprophet/_config.py index 81e1134..3439fca 100644 --- a/pyprophet/_config.py +++ b/pyprophet/_config.py @@ -456,10 +456,13 @@ class IPFIOConfig(BaseIOConfig): ipf_ms2_scoring (bool): Use MS2 precursor data for IPF. ipf_h0 (bool): Include possibility that peak groups are not covered by the peptidoform space (null hypothesis H0). ipf_grouped_fdr (bool): [Experimental] Compute grouped FDR instead of pooled FDR to support heterogeneous peptidoform counts per peak group. + ipf_grouped_fdr_strategy (Literal["num_peptidoforms"]): Grouping strategy used when grouped FDR is enabled. ipf_max_precursor_pep (float): Maximum PEP to consider scored precursors in IPF. ipf_max_peakgroup_pep (float): Maximum PEP to consider scored peak groups in IPF. ipf_max_precursor_peakgroup_pep (float): Maximum BHM layer 1 integrated precursor-peakgroup PEP to consider in IPF. ipf_max_transition_pep (float): Maximum PEP to consider scored transitions in IPF. + ipf_min_supporting_transitions (int): Minimum number of supporting transitions required to keep an inferred peptidoform result. + ipf_min_peakgroup_intensity (float): Minimum MS2 peakgroup area intensity required to keep an inferred peptidoform result. propagate_signal_across_runs (bool): Propagate signal across runs (requires alignment step). ipf_max_alignment_pep (float): Maximum PEP to consider for good alignments. across_run_confidence_threshold (float): Maximum PEP threshold for propagating signal across runs for aligned features. @@ -471,10 +474,13 @@ class IPFIOConfig(BaseIOConfig): ipf_ms2_scoring: bool = True ipf_h0: bool = True ipf_grouped_fdr: bool = False + ipf_grouped_fdr_strategy: Literal["num_peptidoforms"] = "num_peptidoforms" ipf_max_precursor_pep: float = 0.7 ipf_max_peakgroup_pep: float = 0.7 ipf_max_precursor_peakgroup_pep: float = 0.4 ipf_max_transition_pep: float = 0.6 + ipf_min_supporting_transitions: int = 0 + ipf_min_peakgroup_intensity: float = 0.0 propagate_signal_across_runs: bool = False ipf_max_alignment_pep: float = 0.7 across_run_confidence_threshold: float = 0.5 @@ -493,10 +499,13 @@ def from_cli_args( ipf_ms2_scoring, ipf_h0, ipf_grouped_fdr, + ipf_grouped_fdr_strategy, ipf_max_precursor_pep, ipf_max_peakgroup_pep, ipf_max_precursor_peakgroup_pep, ipf_max_transition_pep, + ipf_min_supporting_transitions, + ipf_min_peakgroup_intensity, propagate_signal_across_runs, ipf_max_alignment_pep, across_run_confidence_threshold, @@ -516,10 +525,13 @@ def from_cli_args( ipf_ms2_scoring=ipf_ms2_scoring, ipf_h0=ipf_h0, ipf_grouped_fdr=ipf_grouped_fdr, + ipf_grouped_fdr_strategy=ipf_grouped_fdr_strategy, ipf_max_precursor_pep=ipf_max_precursor_pep, ipf_max_peakgroup_pep=ipf_max_peakgroup_pep, ipf_max_precursor_peakgroup_pep=ipf_max_precursor_peakgroup_pep, ipf_max_transition_pep=ipf_max_transition_pep, + ipf_min_supporting_transitions=ipf_min_supporting_transitions, + ipf_min_peakgroup_intensity=ipf_min_peakgroup_intensity, propagate_signal_across_runs=propagate_signal_across_runs, ipf_max_alignment_pep=ipf_max_alignment_pep, across_run_confidence_threshold=across_run_confidence_threshold, diff --git a/pyprophet/cli/ipf.py b/pyprophet/cli/ipf.py index 3a33651..26cd4b1 100644 --- a/pyprophet/cli/ipf.py +++ b/pyprophet/cli/ipf.py @@ -48,6 +48,13 @@ show_default=True, help="[Experimental] Compute grouped FDR instead of pooled FDR to better support data where peak groups are evaluated to originate from very heterogeneous numbers of peptidoforms.", ) +@click.option( + "--ipf_grouped_fdr_strategy", + default="num_peptidoforms", + show_default=True, + type=click.Choice(["num_peptidoforms"]), + help="Grouping strategy used when --ipf_grouped_fdr is enabled.", +) @click.option( "--ipf_max_precursor_pep", default=0.7, @@ -76,6 +83,20 @@ type=float, help="Maximum PEP to consider scored transitions in IPF.", ) +@click.option( + "--ipf_min_supporting_transitions", + default=0, + show_default=True, + type=int, + help="Minimum number of supporting identifying transitions required to keep an inferred peptidoform result. Applied as a post-IPF filter; 0 disables the filter.", +) +@click.option( + "--ipf_min_peakgroup_intensity", + default=0.0, + show_default=True, + type=float, + help="Minimum FEATURE_MS2 area intensity required to keep an inferred peptidoform result. Applied as a post-IPF filter; 0 disables the filter.", +) @click.option( "--propagate_signal_across_runs/--no-propagate_signal_across_runs", default=False, @@ -120,10 +141,13 @@ def ipf( ipf_ms2_scoring, ipf_h0, ipf_grouped_fdr, + ipf_grouped_fdr_strategy, ipf_max_precursor_pep, ipf_max_peakgroup_pep, ipf_max_precursor_peakgroup_pep, ipf_max_transition_pep, + ipf_min_supporting_transitions, + ipf_min_peakgroup_intensity, propagate_signal_across_runs, ipf_max_alignment_pep, across_run_confidence_threshold, @@ -155,10 +179,13 @@ def ipf( ipf_ms2_scoring, ipf_h0, ipf_grouped_fdr, + ipf_grouped_fdr_strategy, ipf_max_precursor_pep, ipf_max_peakgroup_pep, ipf_max_precursor_peakgroup_pep, ipf_max_transition_pep, + ipf_min_supporting_transitions, + ipf_min_peakgroup_intensity, propagate_signal_across_runs, ipf_max_alignment_pep, across_run_confidence_threshold, diff --git a/pyprophet/glyco/glycoform.py b/pyprophet/glyco/glycoform.py index f5fe832..3825df6 100644 --- a/pyprophet/glyco/glycoform.py +++ b/pyprophet/glyco/glycoform.py @@ -15,28 +15,105 @@ from .pepmass import GlycoPeptideMassCalculator -def get_feature_mapping_across_runs(infile, ipf_max_alignment_pep=1): +def get_feature_mapping_across_runs( + infile, max_alignment_pep=0.5, min_mapping_confidence=None +): click.echo("Info: Reading Across Run Feature Alignment Mapping ... ", nl=False) start = time.time() - con = sqlite3.connect(infile) + with sqlite3.connect(infile) as con: + use_alignment_candidates = ( + min_mapping_confidence is not None + and check_sqlite_table(con, "FEATURE_MS2_ALIGNMENT_CANDIDATE") + ) - data = pd.read_sql_query( - f"""SELECT - DENSE_RANK() OVER (ORDER BY PRECURSOR_ID, ALIGNMENT_ID) AS ALIGNMENT_GROUP_ID, - ALIGNED_FEATURE_ID AS FEATURE_ID - FROM (SELECT DISTINCT * FROM FEATURE_MS2_ALIGNMENT) AS FEATURE_MS2_ALIGNMENT - INNER JOIN - (SELECT DISTINCT *, MIN(QVALUE) FROM SCORE_ALIGNMENT GROUP BY FEATURE_ID) AS SCORE_ALIGNMENT - ON SCORE_ALIGNMENT.FEATURE_ID = FEATURE_MS2_ALIGNMENT.ALIGNED_FEATURE_ID - WHERE LABEL = 1 - AND SCORE_ALIGNMENT.PEP < {ipf_max_alignment_pep} - ORDER BY ALIGNMENT_GROUP_ID""", - con, - ) + if use_alignment_candidates: + query = """ + SELECT + DENSE_RANK() OVER (ORDER BY PRECURSOR_ID, ALIGNMENT_ID) AS ALIGNMENT_GROUP_ID, + ALIGNMENT_ID, + FEATURE_ID, + PRECURSOR_ID, + FEATURE_TYPE + FROM ( + SELECT DISTINCT + ALIGNMENT_ID, + PRECURSOR_ID, + REFERENCE_FEATURE_ID AS FEATURE_ID, + 'REFERENCE' AS FEATURE_TYPE + FROM FEATURE_MS2_ALIGNMENT_CANDIDATE + WHERE SELECTED = 1 + AND MAPPING_CONFIDENCE >= ? + AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID + AND ALIGNED_FEATURE_ID != -1 + + UNION + + SELECT DISTINCT + ALIGNMENT_ID, + PRECURSOR_ID, + ALIGNED_FEATURE_ID AS FEATURE_ID, + 'QUERY' AS FEATURE_TYPE + FROM FEATURE_MS2_ALIGNMENT_CANDIDATE + WHERE SELECTED = 1 + AND MAPPING_CONFIDENCE >= ? + AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID + AND ALIGNED_FEATURE_ID != -1 + ) AS feature_list + ORDER BY + ALIGNMENT_GROUP_ID, + CASE FEATURE_TYPE + WHEN 'REFERENCE' THEN 0 + WHEN 'QUERY' THEN 1 + END + """ + data = pd.read_sql_query( + query, + con, + params=[min_mapping_confidence, min_mapping_confidence], + ) + else: + if not check_sqlite_table(con, "FEATURE_MS2_ALIGNMENT") or not check_sqlite_table( + con, "SCORE_ALIGNMENT" + ): + raise click.ClickException( + "Perform feature alignment using ARYCAL, and apply scoring to alignment-level data before running glycoform inference." + ) + + query = f""" + SELECT + DENSE_RANK() OVER (ORDER BY PRECURSOR_ID, ALIGNMENT_ID) AS ALIGNMENT_GROUP_ID, + FEATURE_ID + FROM ( + SELECT DISTINCT + ALIGNMENT_ID, + PRECURSOR_ID, + REFERENCE_FEATURE_ID AS FEATURE_ID + FROM FEATURE_MS2_ALIGNMENT + WHERE LABEL = 1 + AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID + + UNION + + SELECT DISTINCT + ALIGNMENT_ID, + PRECURSOR_ID, + ALIGNED_FEATURE_ID AS FEATURE_ID + FROM FEATURE_MS2_ALIGNMENT + WHERE LABEL = 1 + AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID + ) AS feature_list + INNER JOIN ( + SELECT DISTINCT FEATURE_ID + FROM SCORE_ALIGNMENT + WHERE PEP < {max_alignment_pep} + ) AS good_alignments + ON good_alignments.FEATURE_ID = feature_list.FEATURE_ID + ORDER BY ALIGNMENT_GROUP_ID + """ + data = pd.read_sql_query(query, con) data.columns = [col.lower() for col in data.columns] - con.close() end = time.time() click.echo(f"{end-start:.4f} seconds") @@ -604,7 +681,7 @@ def infer_glycoforms( ## prepare for propagating signal across runs for aligned features if propagate_signal_across_runs: across_run_feature_map = get_feature_mapping_across_runs( - infile, max_alignment_pep + infile, max_alignment_pep=max_alignment_pep ) transition_table = pd.merge( transition_table, across_run_feature_map, on="feature_id", how="left" diff --git a/pyprophet/io/ipf/osw.py b/pyprophet/io/ipf/osw.py index 98e570a..bb18100 100644 --- a/pyprophet/io/ipf/osw.py +++ b/pyprophet/io/ipf/osw.py @@ -7,7 +7,7 @@ import pandas as pd import click from loguru import logger -from ..util import check_sqlite_table, check_duckdb_table +from ..util import check_sqlite_table, check_duckdb_table, get_table_columns from .._base import BaseOSWReader, BaseOSWWriter from ..._config import IPFIOConfig @@ -121,10 +121,26 @@ def _read_pyp_peakgroup_precursor_duckdb(self, con): ipf_ms1 = cfg.ipf_ms1_scoring ipf_ms2 = cfg.ipf_ms2_scoring pep_threshold = cfg.ipf_max_peakgroup_pep + add_intensity = cfg.ipf_min_peakgroup_intensity > 0 + intensity_select = ( + ",\n FEATURE_MS2.AREA_INTENSITY AS FEATURE_MS2_INTENSITY" + if add_intensity + else "" + ) + feature_ms2_join = ( + "\n INNER JOIN osw.FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID" + if add_intensity + else "" + ) # precursors are restricted according to ipf_max_peakgroup_pep to exclude very poor peak groups logger.info("Reading precursor-level data ...") + if add_intensity and not check_duckdb_table(con, "main", "FEATURE_MS2"): + raise click.ClickException( + "FEATURE_MS2 is required for peakgroup-intensity IPF filtering." + ) + if not ipf_ms1 and ipf_ms2: # only use MS2 precursors if not check_duckdb_table( con, "main", "SCORE_MS2" @@ -136,9 +152,10 @@ def _read_pyp_peakgroup_precursor_duckdb(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, NULL AS MS1_PRECURSOR_PEP, - SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP + SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM osw.PRECURSOR INNER JOIN osw.FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN osw.SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID INNER JOIN ( SELECT FEATURE_ID, PEP @@ -161,9 +178,10 @@ def _read_pyp_peakgroup_precursor_duckdb(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, SCORE_MS1.PEP AS MS1_PRECURSOR_PEP, - NULL AS MS2_PRECURSOR_PEP + NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM osw.PRECURSOR INNER JOIN osw.FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN osw.SCORE_MS1 ON FEATURE.ID = SCORE_MS1.FEATURE_ID INNER JOIN osw.SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID WHERE PRECURSOR.DECOY=0 AND SCORE_MS2.PEP < {pep_threshold} @@ -183,9 +201,10 @@ def _read_pyp_peakgroup_precursor_duckdb(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, SCORE_MS1.PEP AS MS1_PRECURSOR_PEP, - SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP + SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM osw.PRECURSOR INNER JOIN osw.FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN osw.SCORE_MS1 ON FEATURE.ID = SCORE_MS1.FEATURE_ID INNER JOIN osw.SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID INNER JOIN ( @@ -209,9 +228,10 @@ def _read_pyp_peakgroup_precursor_duckdb(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, NULL AS MS1_PRECURSOR_PEP, - NULL AS MS2_PRECURSOR_PEP + NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM osw.PRECURSOR INNER JOIN osw.FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN osw.SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID WHERE PRECURSOR.DECOY=0 AND SCORE_MS2.PEP < {pep_threshold} """ @@ -223,19 +243,42 @@ def _read_pyp_transition_duckdb(self, con): rc = self.config ipf_h0 = rc.ipf_h0 pep_threshold = rc.ipf_max_transition_pep + has_annotation = "ANNOTATION" in get_table_columns(self.infile, "TRANSITION") + phospho_loss_expr = ( + "CASE WHEN COALESCE(TRANSITION.ANNOTATION, '') LIKE '%-H3O4P1%' THEN 1 ELSE 0 END" + if has_annotation + else "0" + ) # only the evidence is restricted to ipf_max_transition_pep, the peptidoform-space is complete logger.info("Info: Reading peptidoform-level data ...") queries = { "evidence": f""" - SELECT FEATURE_ID, TRANSITION_ID, PEP + SELECT SCORE_TRANSITION.FEATURE_ID, SCORE_TRANSITION.TRANSITION_ID, PEP FROM osw.SCORE_TRANSITION INNER JOIN osw.TRANSITION ON SCORE_TRANSITION.TRANSITION_ID = TRANSITION.ID WHERE TRANSITION.TYPE != '' AND TRANSITION.DECOY = 0 AND PEP < {pep_threshold} """, + "transition_meta": """ + WITH mapped_counts AS ( + SELECT + TRANSITION_ID, + COUNT(DISTINCT PEPTIDE_ID) AS N_MAPPED_PEPTIDES + FROM osw.TRANSITION_PEPTIDE_MAPPING + GROUP BY TRANSITION_ID + ) + SELECT DISTINCT + TRANSITION.ID AS TRANSITION_ID, + COALESCE(mapped_counts.N_MAPPED_PEPTIDES, 0) AS N_MAPPED_PEPTIDES, + {phospho_loss_expr} AS HAS_PHOSPHO_LOSS + FROM osw.TRANSITION + LEFT JOIN mapped_counts ON TRANSITION.ID = mapped_counts.TRANSITION_ID + WHERE TRANSITION.TYPE != '' + AND TRANSITION.DECOY = 0 + """.format(phospho_loss_expr=phospho_loss_expr), "bitmask": """ SELECT DISTINCT TRANSITION.ID AS TRANSITION_ID, PEPTIDE_ID, 1 AS BMASK FROM osw.SCORE_TRANSITION @@ -265,6 +308,9 @@ def _read_pyp_transition_duckdb(self, con): # Execute evidence = con.execute(queries["evidence"]).fetchdf().rename(columns=str.lower) + transition_meta = ( + con.execute(queries["transition_meta"]).fetchdf().rename(columns=str.lower) + ) bitmask = con.execute(queries["bitmask"]).fetchdf().rename(columns=str.lower) num_peptidoforms = ( con.execute(queries["num_peptidoforms"]).fetchdf().rename(columns=str.lower) @@ -290,6 +336,7 @@ def _read_pyp_transition_duckdb(self, con): # Merge trans_pf = pd.merge(evidence, peptidoforms, how="outer", on="feature_id") + trans_pf = pd.merge(trans_pf, transition_meta, how="left", on="transition_id") trans_pf_bm = pd.merge( trans_pf, bitmask, how="left", on=["transition_id", "peptide_id"] ).fillna(0) @@ -420,6 +467,22 @@ def _read_pyp_peakgroup_precursor_sqlite(self, con): ipf_ms1 = cfg.ipf_ms1_scoring ipf_ms2 = cfg.ipf_ms2_scoring pep_threshold = cfg.ipf_max_peakgroup_pep + add_intensity = cfg.ipf_min_peakgroup_intensity > 0 + intensity_select = ( + ",\n FEATURE_MS2.AREA_INTENSITY AS FEATURE_MS2_INTENSITY" + if add_intensity + else "" + ) + feature_ms2_join = ( + "\n INNER JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID" + if add_intensity + else "" + ) + + if add_intensity and not check_sqlite_table(con, "FEATURE_MS2"): + raise click.ClickException( + "FEATURE_MS2 is required for peakgroup-intensity IPF filtering." + ) if not ipf_ms1 and ipf_ms2: # only use MS2 precursors if not check_sqlite_table(con, "SCORE_MS2") or not check_sqlite_table( @@ -433,9 +496,10 @@ def _read_pyp_peakgroup_precursor_sqlite(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, NULL AS MS1_PRECURSOR_PEP, - SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP + SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM PRECURSOR INNER JOIN FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID INNER JOIN ( SELECT FEATURE_ID, PEP @@ -458,9 +522,10 @@ def _read_pyp_peakgroup_precursor_sqlite(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, SCORE_MS1.PEP AS MS1_PRECURSOR_PEP, - NULL AS MS2_PRECURSOR_PEP + NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM PRECURSOR INNER JOIN FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN SCORE_MS1 ON FEATURE.ID = SCORE_MS1.FEATURE_ID INNER JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID WHERE PRECURSOR.DECOY=0 AND SCORE_MS2.PEP < ? @@ -479,9 +544,10 @@ def _read_pyp_peakgroup_precursor_sqlite(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, SCORE_MS1.PEP AS MS1_PRECURSOR_PEP, - SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP + SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM PRECURSOR INNER JOIN FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN SCORE_MS1 ON FEATURE.ID = SCORE_MS1.FEATURE_ID INNER JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID INNER JOIN ( @@ -498,33 +564,63 @@ def _read_pyp_peakgroup_precursor_sqlite(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, NULL AS MS1_PRECURSOR_PEP, - NULL AS MS2_PRECURSOR_PEP + NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM PRECURSOR INNER JOIN FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID WHERE PRECURSOR.DECOY=0 AND SCORE_MS2.PEP < ? """ - df = pd.read_sql_query(query, con, params=[pep_threshold]) + df = pd.read_sql_query( + query.format( + intensity_select=intensity_select, feature_ms2_join=feature_ms2_join + ), + con, + params=[pep_threshold], + ) return df.rename(columns=str.lower) def _read_pyp_transition_sqlite(self, con): rc = self.config ipf_h0 = rc.ipf_h0 pep_threshold = rc.ipf_max_transition_pep + has_annotation = "ANNOTATION" in get_table_columns(self.infile, "TRANSITION") + phospho_loss_expr = ( + "CASE WHEN COALESCE(TRANSITION.ANNOTATION, '') LIKE '%-H3O4P1%' THEN 1 ELSE 0 END" + if has_annotation + else "0" + ) # only the evidence is restricted to ipf_max_transition_pep, the peptidoform-space is complete logger.info("Info: Reading peptidoform-level data ...") queries = { "evidence": """ - SELECT FEATURE_ID, TRANSITION_ID, PEP + SELECT SCORE_TRANSITION.FEATURE_ID, SCORE_TRANSITION.TRANSITION_ID, PEP FROM SCORE_TRANSITION INNER JOIN TRANSITION ON SCORE_TRANSITION.TRANSITION_ID = TRANSITION.ID WHERE TRANSITION.TYPE != '' AND TRANSITION.DECOY = 0 AND PEP < ? """, + "transition_meta": """ + WITH mapped_counts AS ( + SELECT + TRANSITION_ID, + COUNT(DISTINCT PEPTIDE_ID) AS N_MAPPED_PEPTIDES + FROM TRANSITION_PEPTIDE_MAPPING + GROUP BY TRANSITION_ID + ) + SELECT DISTINCT + TRANSITION.ID AS TRANSITION_ID, + COALESCE(mapped_counts.N_MAPPED_PEPTIDES, 0) AS N_MAPPED_PEPTIDES, + {phospho_loss_expr} AS HAS_PHOSPHO_LOSS + FROM TRANSITION + LEFT JOIN mapped_counts ON TRANSITION.ID = mapped_counts.TRANSITION_ID + WHERE TRANSITION.TYPE != '' + AND TRANSITION.DECOY = 0 + """.format(phospho_loss_expr=phospho_loss_expr), "bitmask": """ SELECT DISTINCT TRANSITION.ID AS TRANSITION_ID, PEPTIDE_ID, 1 AS BMASK FROM SCORE_TRANSITION @@ -554,7 +650,12 @@ def _read_pyp_transition_sqlite(self, con): # Execute evidence = pd.read_sql_query( - queries["evidence"], con, params=[pep_threshold] + queries["evidence"], + con, + params=[pep_threshold], + ).rename(columns=str.lower) + transition_meta = pd.read_sql_query( + queries["transition_meta"], con ).rename(columns=str.lower) bitmask = pd.read_sql_query(queries["bitmask"], con).rename(columns=str.lower) num_peptidoforms = pd.read_sql_query(queries["num_peptidoforms"], con).rename( @@ -581,6 +682,7 @@ def _read_pyp_transition_sqlite(self, con): # Merge trans_pf = pd.merge(evidence, peptidoforms, how="outer", on="feature_id") + trans_pf = pd.merge(trans_pf, transition_meta, how="left", on="transition_id") trans_pf_bm = pd.merge( trans_pf, bitmask, how="left", on=["transition_id", "peptide_id"] ).fillna(0) diff --git a/pyprophet/io/ipf/parquet.py b/pyprophet/io/ipf/parquet.py index f97e6bd..87912f0 100644 --- a/pyprophet/io/ipf/parquet.py +++ b/pyprophet/io/ipf/parquet.py @@ -71,10 +71,21 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: ipf_ms1 = cfg.ipf_ms1_scoring ipf_ms2 = cfg.ipf_ms2_scoring pep_threshold = cfg.ipf_max_peakgroup_pep + add_intensity = cfg.ipf_min_peakgroup_intensity > 0 logger.info("Reading precursor-level data ...") all_cols = get_parquet_column_names(self.infile) + intensity_select = ( + ",\n FEATURE_MS2_AREA_INTENSITY AS FEATURE_MS2_INTENSITY" + if add_intensity + else "" + ) + + if add_intensity and "FEATURE_MS2_AREA_INTENSITY" not in all_cols: + raise click.ClickException( + "FEATURE_MS2_AREA_INTENSITY is required for peakgroup-intensity IPF filtering." + ) if not ipf_ms1 and ipf_ms2: if not any(c.startswith("SCORE_MS2_") for c in all_cols) or not any( @@ -85,7 +96,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: SELECT FEATURE_ID, SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, NULL AS MS1_PRECURSOR_PEP, - SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP + SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM data WHERE PRECURSOR_DECOY = 0 AND TRANSITION_TYPE = '' @@ -102,7 +113,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: SELECT FEATURE_ID, SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, SCORE_MS1_PEP AS MS1_PRECURSOR_PEP, - NULL AS MS2_PRECURSOR_PEP + NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM data WHERE PRECURSOR_DECOY = 0 AND SCORE_MS2_PEP < {pep_threshold} @@ -123,7 +134,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: SELECT FEATURE_ID, SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, SCORE_MS1_PEP AS MS1_PRECURSOR_PEP, - SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP + SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM data WHERE PRECURSOR_DECOY = 0 AND TRANSITION_TYPE = '' @@ -140,7 +151,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: SELECT FEATURE_ID, SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, NULL AS MS1_PRECURSOR_PEP, - NULL AS MS2_PRECURSOR_PEP + NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM data WHERE PRECURSOR_DECOY = 0 AND SCORE_MS2_PEP < {pep_threshold} @@ -162,12 +173,24 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: raise click.ClickException( "IPF_PEPTIDE_ID column is required in transition features." ) + has_annotation = "ANNOTATION" in all_cols + has_overlap_col = "FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE" in all_cols + has_phospho_loss_sql = ( + "MAX(CASE WHEN COALESCE(ANNOTATION, '') LIKE '%-H3O4P1%' THEN 1 ELSE 0 END) AS HAS_PHOSPHO_LOSS" + if has_annotation + else "0 AS HAS_PHOSPHO_LOSS" + ) + overlap_select = ( + ", FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE AS ISOTOPE_OVERLAP_SCORE" + if has_overlap_col + else "" + ) # Use DuckDB view `data` created in _init_duckdb_views() # Evidence table: transition-level PEPs query = f""" - SELECT FEATURE_ID, TRANSITION_ID, SCORE_TRANSITION_PEP AS PEP + SELECT FEATURE_ID, TRANSITION_ID, SCORE_TRANSITION_PEP AS PEP{overlap_select} FROM data WHERE TRANSITION_TYPE != '' AND TRANSITION_DECOY = 0 @@ -176,6 +199,20 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: """ evidence = con.execute(query).df().rename(columns=str.lower) + query = f""" + SELECT + TRANSITION_ID, + COUNT(DISTINCT IPF_PEPTIDE_ID) AS N_MAPPED_PEPTIDES, + {has_phospho_loss_sql} + FROM data + WHERE TRANSITION_TYPE != '' + AND TRANSITION_DECOY = 0 + AND SCORE_TRANSITION_SCORE IS NOT NULL + AND IPF_PEPTIDE_ID IS NOT NULL + GROUP BY TRANSITION_ID + """ + transition_meta = con.execute(query).df().rename(columns=str.lower) + # Bitmask table: transition-peptidoform presence query = """ SELECT DISTINCT TRANSITION_ID, IPF_PEPTIDE_ID AS PEPTIDE_ID, 1 AS BMASK @@ -223,6 +260,7 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: # Merge all parts into final dataframe trans_pf = pd.merge(evidence, peptidoforms, how="outer", on="feature_id") + trans_pf = pd.merge(trans_pf, transition_meta, how="left", on="transition_id") trans_pf_bm = pd.merge( trans_pf, bitmask, how="left", on=["transition_id", "peptide_id"] ).fillna(0) diff --git a/pyprophet/io/ipf/split_parquet.py b/pyprophet/io/ipf/split_parquet.py index 085cdea..93acab5 100644 --- a/pyprophet/io/ipf/split_parquet.py +++ b/pyprophet/io/ipf/split_parquet.py @@ -63,6 +63,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: ipf_ms1 = cfg.ipf_ms1_scoring ipf_ms2 = cfg.ipf_ms2_scoring pep_threshold = cfg.ipf_max_peakgroup_pep + add_intensity = cfg.ipf_min_peakgroup_intensity > 0 logger.info("Reading precursor-level data ...") @@ -81,6 +82,16 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: all_precursor_cols = get_parquet_column_names(precursor_files[0]) all_transition_cols = get_parquet_column_names(transition_files[0]) + intensity_select = ( + ",\n p.FEATURE_MS2_AREA_INTENSITY AS FEATURE_MS2_INTENSITY" + if add_intensity + else "" + ) + + if add_intensity and "FEATURE_MS2_AREA_INTENSITY" not in all_precursor_cols: + raise click.ClickException( + "FEATURE_MS2_AREA_INTENSITY is required for peakgroup-intensity IPF filtering." + ) # con.execute( # f"CREATE VIEW precursors AS SELECT * FROM read_parquet({precursor_files})" @@ -98,7 +109,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: raise click.ClickException("Apply MS2 + transition scoring before IPF.") query = f""" SELECT p.FEATURE_ID, p.SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, - NULL AS MS1_PRECURSOR_PEP, t.SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP + NULL AS MS1_PRECURSOR_PEP, t.SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM precursors p INNER JOIN ( SELECT FEATURE_ID, SCORE_TRANSITION_PEP @@ -115,7 +126,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: raise click.ClickException("Apply MS1 + MS2 scoring before IPF.") query = f""" SELECT p.FEATURE_ID, p.SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, - p.SCORE_MS1_PEP AS MS1_PRECURSOR_PEP, NULL AS MS2_PRECURSOR_PEP + p.SCORE_MS1_PEP AS MS1_PRECURSOR_PEP, NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM precursors p WHERE p.PRECURSOR_DECOY = 0 AND p.SCORE_MS2_PEP < {pep_threshold}; """ @@ -133,7 +144,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: ) query = f""" SELECT p.FEATURE_ID, p.SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, - p.SCORE_MS1_PEP AS MS1_PRECURSOR_PEP, t.SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP + p.SCORE_MS1_PEP AS MS1_PRECURSOR_PEP, t.SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM precursors p INNER JOIN ( SELECT FEATURE_ID, SCORE_TRANSITION_PEP @@ -152,7 +163,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: raise click.ClickException("Apply MS2 + transition scoring before IPF.") query = f""" SELECT p.FEATURE_ID, p.SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, - NULL AS MS1_PRECURSOR_PEP, NULL AS MS2_PRECURSOR_PEP + NULL AS MS1_PRECURSOR_PEP, NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM precursors p WHERE p.PRECURSOR_DECOY = 0 AND p.SCORE_MS2_PEP < {pep_threshold}; """ @@ -188,6 +199,20 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: raise click.ClickException( "IPF_PEPTIDE_ID column is required in transition features." ) + has_annotation = "ANNOTATION" in all_transition_cols + has_overlap_col = ( + "FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE" in all_transition_cols + ) + has_phospho_loss_sql = ( + "MAX(CASE WHEN COALESCE(t.ANNOTATION, '') LIKE '%-H3O4P1%' THEN 1 ELSE 0 END) AS HAS_PHOSPHO_LOSS" + if has_annotation + else "0 AS HAS_PHOSPHO_LOSS" + ) + overlap_select = ( + ", t.FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE AS ISOTOPE_OVERLAP_SCORE" + if has_overlap_col + else "" + ) # Load all transition files into DuckDB view con.execute( @@ -196,7 +221,7 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: # Evidence table: transition-level PEPs query = f""" - SELECT t.FEATURE_ID, t.TRANSITION_ID, t.SCORE_TRANSITION_PEP AS PEP + SELECT t.FEATURE_ID, t.TRANSITION_ID, t.SCORE_TRANSITION_PEP AS PEP{overlap_select} FROM transition t WHERE t.TRANSITION_TYPE != '' AND t.TRANSITION_DECOY = 0 @@ -205,6 +230,20 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: """ evidence = con.execute(query).df().rename(columns=str.lower) + query = f""" + SELECT + t.TRANSITION_ID, + COUNT(DISTINCT t.IPF_PEPTIDE_ID) AS N_MAPPED_PEPTIDES, + {has_phospho_loss_sql} + FROM transition t + WHERE t.TRANSITION_TYPE != '' + AND t.TRANSITION_DECOY = 0 + AND t.SCORE_TRANSITION_SCORE IS NOT NULL + AND t.IPF_PEPTIDE_ID IS NOT NULL + GROUP BY t.TRANSITION_ID + """ + transition_meta = con.execute(query).df().rename(columns=str.lower) + # Bitmask table: transition-peptidoform presence query = """ SELECT DISTINCT t.TRANSITION_ID, t.IPF_PEPTIDE_ID AS PEPTIDE_ID, 1 AS BMASK @@ -250,6 +289,7 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: # Merge all parts into final dataframe trans_pf = pd.merge(evidence, peptidoforms, how="outer", on="feature_id") + trans_pf = pd.merge(trans_pf, transition_meta, how="left", on="transition_id") trans_pf_bm = pd.merge( trans_pf, bitmask, how="left", on=["transition_id", "peptide_id"] ).fillna(0) diff --git a/pyprophet/ipf.py b/pyprophet/ipf.py index d91f301..c02f2d7 100644 --- a/pyprophet/ipf.py +++ b/pyprophet/ipf.py @@ -30,6 +30,7 @@ import numpy as np import pandas as pd from loguru import logger +from scipy.special import logsumexp from scipy.stats import rankdata from ._config import IPFIOConfig @@ -63,6 +64,453 @@ def compute_model_fdr(data_in): return fdr +def compute_grouped_model_fdr(data_in, group_keys, log_prefix="Grouped FDR"): + """ + Compute model-based FDR estimates independently within each group. + + Args: + data_in (array-like): Input posterior error probabilities. + group_keys (array-like): Group label per row in ``data_in``. + log_prefix (str): Prefix used for group-count logging. + + Returns: + np.ndarray: FDR estimates for the input data, grouped by ``group_keys``. + """ + data = np.asarray(data_in, dtype=float) + groups = pd.Series(group_keys).fillna("NA").astype(str) + if len(groups) != len(data): + raise ValueError("group_keys must have the same length as data_in.") + + qvalues = np.empty(len(data), dtype=float) + counts = groups.value_counts().sort_index() + logger.info( + f"{log_prefix}: " + + ", ".join(f"{group}={count}" for group, count in counts.items()) + ) + + for group, idx in groups.groupby(groups).groups.items(): + idx_arr = np.fromiter(idx, dtype=int) + qvalues[idx_arr] = compute_model_fdr(data[idx_arr]) + + return qvalues + + +def compute_ipf_qvalues( + pf_pp_data, grouped_fdr=False, grouped_fdr_strategy="num_peptidoforms" +): + """ + Compute IPF q-values using pooled or grouped model-based FDR. + + Grouped FDR groups rows according to ``grouped_fdr_strategy``. + """ + if not grouped_fdr: + return compute_model_fdr(pf_pp_data["pep"]) + + if grouped_fdr_strategy == "num_peptidoforms": + if "num_peptidoforms" not in pf_pp_data.columns: + raise ValueError("num_peptidoforms is required for grouped FDR.") + return compute_grouped_model_fdr( + pf_pp_data["pep"], + pf_pp_data["num_peptidoforms"].fillna(-1).astype(int), + log_prefix="Grouped FDR by num_peptidoforms", + ) + + raise ValueError( + f"Unsupported grouped FDR strategy: {grouped_fdr_strategy!r}" + ) + + +def compute_post_ipf_filter_metrics(transition_table, precursor_table): + """ + Computes per-feature / per-hypothesis metrics used for optional post-IPF filtering. + + Args: + transition_table (pd.DataFrame): Transition-level peptidoform table before Bayesian modeling. + precursor_table (pd.DataFrame): Peakgroup / precursor-level table, optionally including feature_ms2_intensity. + + Returns: + pd.DataFrame: One row per feature_id + peptide_id with supporting transition + counts and optional feature_ms2_intensity. + """ + hypotheses = transition_table.loc[ + transition_table["peptide_id"] != -1, ["feature_id", "peptide_id"] + ].drop_duplicates() + + supporting_cols = ["feature_id", "peptide_id", "transition_id"] + supporting_cols.extend( + [ + col + for col in [ + "n_mapped_peptides", + "has_phospho_loss", + "isotope_overlap_score", + ] + if col in transition_table.columns + ] + ) + supporting_rows = transition_table.loc[ + (transition_table["peptide_id"] != -1) & (transition_table["bmask"] == 1), + supporting_cols, + ].drop_duplicates() + + supporting = ( + supporting_rows.groupby(["feature_id", "peptide_id"], as_index=False)[ + "transition_id" + ] + .nunique() + .rename(columns={"transition_id": "supporting_transitions"}) + ) + + if "n_mapped_peptides" in supporting_rows.columns: + unique_supporting = ( + supporting_rows.loc[supporting_rows["n_mapped_peptides"] == 1] + .groupby(["feature_id", "peptide_id"], as_index=False)["transition_id"] + .nunique() + .rename(columns={"transition_id": "unique_supporting_transitions"}) + ) + else: + unique_supporting = pd.DataFrame( + columns=["feature_id", "peptide_id", "unique_supporting_transitions"] + ) + + if "has_phospho_loss" in supporting_rows.columns: + phospho_loss_supporting = ( + supporting_rows.loc[supporting_rows["has_phospho_loss"] == 1] + .groupby(["feature_id", "peptide_id"], as_index=False)["transition_id"] + .nunique() + .rename( + columns={"transition_id": "phospho_loss_supporting_transitions"} + ) + ) + else: + phospho_loss_supporting = pd.DataFrame( + columns=[ + "feature_id", + "peptide_id", + "phospho_loss_supporting_transitions", + ] + ) + + if "isotope_overlap_score" in supporting_rows.columns: + supporting_overlap = ( + supporting_rows.groupby(["feature_id", "peptide_id"], as_index=False)[ + "isotope_overlap_score" + ] + .median() + .rename( + columns={ + "isotope_overlap_score": "median_supporting_isotope_overlap" + } + ) + ) + else: + supporting_overlap = pd.DataFrame( + columns=[ + "feature_id", + "peptide_id", + "median_supporting_isotope_overlap", + ] + ) + + metrics = ( + hypotheses.merge(supporting, on=["feature_id", "peptide_id"], how="left") + .merge(unique_supporting, on=["feature_id", "peptide_id"], how="left") + .merge( + phospho_loss_supporting, + on=["feature_id", "peptide_id"], + how="left", + ) + .merge(supporting_overlap, on=["feature_id", "peptide_id"], how="left") + .fillna( + { + "supporting_transitions": 0, + "unique_supporting_transitions": 0, + "phospho_loss_supporting_transitions": 0, + } + ) + ) + metrics["supporting_transitions"] = metrics["supporting_transitions"].astype(int) + metrics["unique_supporting_transitions"] = metrics[ + "unique_supporting_transitions" + ].astype(int) + metrics["phospho_loss_supporting_transitions"] = metrics[ + "phospho_loss_supporting_transitions" + ].astype(int) + + if "feature_ms2_intensity" in precursor_table.columns: + metrics = metrics.merge( + precursor_table[["feature_id", "feature_ms2_intensity"]].drop_duplicates(), + on="feature_id", + how="left", + ) + + return metrics + + +def prepare_post_ipf_filter_metrics( + transition_table, + precursor_table, + propagate_signal_across_runs=False, + across_run_confidence_threshold=0.5, +): + """ + Prepares post-IPF filter metrics from the same transition evidence state used by IPF. + + When across-run propagation is enabled, supporting-transition counts are computed on the + propagated evidence table so the post-IPF filter matches the final inference behavior. + + Args: + transition_table (pd.DataFrame): Transition-level peptidoform table. + precursor_table (pd.DataFrame): Peakgroup / precursor-level table. + propagate_signal_across_runs (bool): Whether IPF propagates evidence across runs. + across_run_confidence_threshold (float): Confidence threshold for signal propagation. + + Returns: + pd.DataFrame: Metrics from compute_post_ipf_filter_metrics(). + """ + filter_transition_table = transition_table.copy() + + if propagate_signal_across_runs: + non_prop_data = filter_transition_table.loc[ + filter_transition_table["feature_id"] + == filter_transition_table["alignment_group_id"] + ] + prop_data = filter_transition_table.loc[ + filter_transition_table["feature_id"] + != filter_transition_table["alignment_group_id"] + ] + + if len(prop_data) > 0: + group_cols = [ + "feature_id", + "transition_id", + "peptide_id", + "bmask", + "num_peptidoforms", + "alignment_group_id", + ] + group_cols.extend( + [ + col + for col in [ + "n_mapped_peptides", + "has_phospho_loss", + "isotope_overlap_score", + ] + if col in filter_transition_table.columns + ] + ) + propagated_data = ( + prop_data.groupby("alignment_group_id", group_keys=False) + .apply( + lambda df: transfer_confident_evidence_across_runs( + df, + across_run_confidence_threshold, + group_cols=group_cols, + value_cols=["pep"], + ) + ) + .reset_index(drop=True) + ) + filter_transition_table = pd.concat( + [non_prop_data, propagated_data], ignore_index=True + ) + else: + filter_transition_table = non_prop_data.copy() + + return compute_post_ipf_filter_metrics(filter_transition_table, precursor_table) + + +def _ipf_filters_active( + min_supporting_transitions=0, + min_unique_supporting_transitions=0, + require_phospho_loss_below_support=0, + min_peakgroup_intensity=0.0, + conditional_min_peakgroup_intensity=0.0, +): + return ( + min_supporting_transitions > 0 + or min_unique_supporting_transitions > 0 + or require_phospho_loss_below_support > 0 + or min_peakgroup_intensity > 0 + or conditional_min_peakgroup_intensity > 0 + ) + + +def _apply_ipf_filter_thresholds( + metrics, + min_supporting_transitions=0, + min_unique_supporting_transitions=0, + require_phospho_loss_below_support=0, + min_peakgroup_intensity=0.0, + conditional_min_peakgroup_intensity=0.0, + conditional_min_peakgroup_intensity_max_supporting_transitions=0, + conditional_min_peakgroup_intensity_no_phospho_loss_only=False, + log_prefix="Applied IPF", +): + filtered = metrics.copy() + + for metric_col in [ + "supporting_transitions", + "unique_supporting_transitions", + "phospho_loss_supporting_transitions", + ]: + if metric_col not in filtered.columns: + filtered[metric_col] = 0 + filtered[metric_col] = filtered[metric_col].fillna(0).astype(int) + + if min_supporting_transitions > 0: + before = len(filtered) + filtered = filtered[ + filtered["supporting_transitions"] >= min_supporting_transitions + ].copy() + logger.info( + f"{log_prefix} supporting-transition filter: " + f"kept {len(filtered)}/{before} feature-hypothesis rows " + f"with supporting_transitions >= {min_supporting_transitions}." + ) + + if min_unique_supporting_transitions > 0: + before = len(filtered) + filtered = filtered[ + filtered["unique_supporting_transitions"] >= min_unique_supporting_transitions + ].copy() + logger.info( + f"{log_prefix} unique-supporting-transition filter: " + f"kept {len(filtered)}/{before} feature-hypothesis rows " + f"with unique_supporting_transitions >= {min_unique_supporting_transitions}." + ) + + if require_phospho_loss_below_support > 0: + before = len(filtered) + filtered = filtered[ + (filtered["supporting_transitions"] >= require_phospho_loss_below_support) + | (filtered["phospho_loss_supporting_transitions"] > 0) + ].copy() + logger.info( + f"{log_prefix} phospho-loss rescue filter: " + f"kept {len(filtered)}/{before} feature-hypothesis rows " + f"with supporting_transitions >= {require_phospho_loss_below_support} " + "or at least one phospho-loss supporting transition." + ) + + if min_peakgroup_intensity > 0: + if "feature_ms2_intensity" not in filtered.columns: + raise ValueError( + "feature_ms2_intensity is required for ipf_min_peakgroup_intensity filtering." + ) + before = len(filtered) + filtered = filtered[ + filtered["feature_ms2_intensity"] >= min_peakgroup_intensity + ].copy() + logger.info( + f"{log_prefix} peakgroup-intensity filter: " + f"kept {len(filtered)}/{before} feature-hypothesis rows " + f"with feature_ms2_intensity >= {min_peakgroup_intensity}." + ) + + if conditional_min_peakgroup_intensity > 0: + if conditional_min_peakgroup_intensity_max_supporting_transitions <= 0: + raise ValueError( + "ipf_conditional_min_peakgroup_intensity_max_supporting_transitions must be > 0 " + "when ipf_conditional_min_peakgroup_intensity is enabled." + ) + if "feature_ms2_intensity" not in filtered.columns: + raise ValueError( + "feature_ms2_intensity is required for ipf_conditional_min_peakgroup_intensity filtering." + ) + + before = len(filtered) + weak_support_mask = ( + filtered["supporting_transitions"] + <= conditional_min_peakgroup_intensity_max_supporting_transitions + ) + if conditional_min_peakgroup_intensity_no_phospho_loss_only: + weak_support_mask = weak_support_mask & ( + filtered["phospho_loss_supporting_transitions"] == 0 + ) + + filtered = filtered[ + (~weak_support_mask) + | ( + filtered["feature_ms2_intensity"] + >= conditional_min_peakgroup_intensity + ) + ].copy() + logger.info( + f"{log_prefix} conditional peakgroup-intensity filter: " + f"kept {len(filtered)}/{before} feature-hypothesis rows " + f"after requiring feature_ms2_intensity >= {conditional_min_peakgroup_intensity} " + f"for rows with supporting_transitions <= " + f"{conditional_min_peakgroup_intensity_max_supporting_transitions}" + + ( + " and no phospho-loss supporting transitions." + if conditional_min_peakgroup_intensity_no_phospho_loss_only + else "." + ) + ) + + return filtered + + +def apply_post_ipf_filters( + result, + filter_metrics, + min_supporting_transitions=0, + min_unique_supporting_transitions=0, + require_phospho_loss_below_support=0, + min_peakgroup_intensity=0.0, + conditional_min_peakgroup_intensity=0.0, + conditional_min_peakgroup_intensity_max_supporting_transitions=0, + conditional_min_peakgroup_intensity_no_phospho_loss_only=False, +): + """ + Applies optional post-IPF filters to inferred peptidoform results. + + Args: + result (pd.DataFrame): Inferred peptidoform results with FEATURE_ID / PEPTIDE_ID. + filter_metrics (pd.DataFrame): Metrics from compute_post_ipf_filter_metrics(). + min_supporting_transitions (int): Minimum supporting transitions required. + min_unique_supporting_transitions (int): Minimum uniquely supporting transitions required. + require_phospho_loss_below_support (int): Require at least one phospho-loss supporting + transition when supporting_transitions is below this threshold. + min_peakgroup_intensity (float): Minimum MS2 feature intensity required. + + Returns: + pd.DataFrame: Filtered peptidoform results. + """ + if not _ipf_filters_active( + min_supporting_transitions=min_supporting_transitions, + min_unique_supporting_transitions=min_unique_supporting_transitions, + require_phospho_loss_below_support=require_phospho_loss_below_support, + min_peakgroup_intensity=min_peakgroup_intensity, + conditional_min_peakgroup_intensity=conditional_min_peakgroup_intensity, + ): + return result + + merged = result.merge( + filter_metrics, + left_on=["FEATURE_ID", "PEPTIDE_ID"], + right_on=["feature_id", "peptide_id"], + how="left", + ) + merged = _apply_ipf_filter_thresholds( + merged, + min_supporting_transitions=min_supporting_transitions, + min_unique_supporting_transitions=min_unique_supporting_transitions, + require_phospho_loss_below_support=require_phospho_loss_below_support, + min_peakgroup_intensity=min_peakgroup_intensity, + conditional_min_peakgroup_intensity=conditional_min_peakgroup_intensity, + conditional_min_peakgroup_intensity_max_supporting_transitions=conditional_min_peakgroup_intensity_max_supporting_transitions, + conditional_min_peakgroup_intensity_no_phospho_loss_only=conditional_min_peakgroup_intensity_no_phospho_loss_only, + log_prefix="Applied post-IPF", + ) + + return merged[ + ["FEATURE_ID", "PEPTIDE_ID", "PRECURSOR_PEAKGROUP_PEP", "QVALUE", "PEP"] + ].copy() + + def prepare_precursor_bm(data): """ Prepares Bayesian model data for precursor-level inference. @@ -282,16 +730,72 @@ def prepare_transition_bm( return data -def apply_bm(data): +def apply_bm(data, use_log_space=False, evidence_epsilon=0.0): """ Applies the Bayesian model to compute posterior probabilities. Args: data (pd.DataFrame): Input Bayesian model data. + use_log_space (bool): Whether to compute Bayesian posteriors in log-space. + evidence_epsilon (float): Optional clipping epsilon applied to evidence values + before inference. 0 disables clipping. Returns: pd.DataFrame: Data with posterior probabilities for each hypothesis. """ + if evidence_epsilon < 0 or evidence_epsilon >= 0.5: + raise ValueError("evidence_epsilon must satisfy 0 <= epsilon < 0.5.") + + if evidence_epsilon > 0: + data = data.copy() + data["evidence"] = data["evidence"].clip( + lower=evidence_epsilon, upper=1 - evidence_epsilon + ) + + if use_log_space: + with np.errstate(divide="ignore", invalid="ignore"): + grouped_logs = ( + data.assign(log_evidence=np.log(data["evidence"])) + .groupby(["feature_id", "hypothesis"])["log_evidence"] + .sum() + .reset_index() + ) + grouped_prior = ( + data.groupby(["feature_id", "hypothesis"], as_index=False)["prior"] + .min() + ) + grouped_logs = grouped_logs.merge( + grouped_prior, on=["feature_id", "hypothesis"], how="left" + ) + grouped_logs["log_prior"] = np.log(grouped_logs["prior"]) + grouped_logs["log_likelihood_prior"] = ( + grouped_logs["log_evidence"] + grouped_logs["log_prior"] + ) + grouped_logs["log_likelihood_sum"] = grouped_logs.groupby("feature_id")[ + "log_likelihood_prior" + ].transform(logsumexp) + grouped_logs["posterior"] = np.exp( + grouped_logs["log_likelihood_prior"] + - grouped_logs["log_likelihood_sum"] + ) + grouped_logs["likelihood_prior"] = np.exp( + grouped_logs["log_likelihood_prior"] + ) + grouped_logs["likelihood_sum"] = np.exp( + grouped_logs["log_likelihood_sum"] + ) + + pp_data = grouped_logs[ + [ + "feature_id", + "hypothesis", + "likelihood_prior", + "likelihood_sum", + "posterior", + ] + ] + return pp_data.fillna(value=0) + # compute likelihood * prior per feature & hypothesis # all priors are identical but pandas DF multiplication requires aggregation, so we use min() pp_data = ( @@ -317,6 +821,8 @@ def precursor_inference( ipf_ms2_scoring, ipf_max_precursor_pep, ipf_max_precursor_peakgroup_pep, + use_log_space_bm=False, + bm_evidence_epsilon=0.0, ): """ Conducts precursor-level inference. @@ -327,6 +833,8 @@ def precursor_inference( ipf_ms2_scoring (bool): Whether to use MS2-level scoring. ipf_max_precursor_pep (float): Maximum PEP threshold for precursors. ipf_max_precursor_peakgroup_pep (float): Maximum PEP threshold for peak groups. + use_log_space_bm (bool): Whether to compute Bayesian posteriors in log-space. + bm_evidence_epsilon (float): Optional clipping epsilon applied to BM evidence. Returns: pd.DataFrame: Inferred precursor probabilities. @@ -364,7 +872,11 @@ def precursor_inference( # compute posterior precursor probability logger.info("Conducting precursor-level inference ... ") - prec_pp_data = apply_bm(precursor_data_bm) + prec_pp_data = apply_bm( + precursor_data_bm, + use_log_space=use_log_space_bm, + evidence_epsilon=bm_evidence_epsilon, + ) prec_pp_data["precursor_peakgroup_pep"] = 1 - prec_pp_data["posterior"] inferred_precursors = prec_pp_data[prec_pp_data["hypothesis"]][ @@ -392,6 +904,7 @@ def peptidoform_inference( transition_table, precursor_data, ipf_grouped_fdr, + ipf_grouped_fdr_strategy, propagate_signal_across_runs, across_run_confidence_threshold, ): @@ -402,6 +915,7 @@ def peptidoform_inference( transition_table (pd.DataFrame): Input data containing transition-level information. precursor_data (pd.DataFrame): Precursor-level probabilities. ipf_grouped_fdr (bool): Whether to use grouped FDR estimation. + ipf_grouped_fdr_strategy (str): Grouping strategy used when grouped FDR is enabled. propagate_signal_across_runs (bool): Whether to propagate signal across runs. across_run_confidence_threshold (float): Confidence threshold for propagation. @@ -421,23 +935,18 @@ def peptidoform_inference( pf_pp_data = apply_bm(transition_data_bm) pf_pp_data["pep"] = 1 - pf_pp_data["posterior"] + pf_pp_data = pf_pp_data.merge( + transition_data_bm[["feature_id", "num_peptidoforms"]].drop_duplicates(), + on=["feature_id"], + how="left", + ) # compute model-based FDR - if ipf_grouped_fdr: - pf_pp_data["qvalue"] = ( - pd.merge( - pf_pp_data, - transition_data_bm[ - ["feature_id", "num_peptidoforms"] - ].drop_duplicates(), - on=["feature_id"], - how="inner", - ) - .groupby("num_peptidoforms")["pep"] - .transform(compute_model_fdr) - ) - else: - pf_pp_data["qvalue"] = compute_model_fdr(pf_pp_data["pep"]) + pf_pp_data["qvalue"] = compute_ipf_qvalues( + pf_pp_data, + grouped_fdr=ipf_grouped_fdr, + grouped_fdr_strategy=ipf_grouped_fdr_strategy, + ) # merge precursor-level data with UIS data result = pf_pp_data.merge( @@ -493,10 +1002,24 @@ def infer_peptidoforms(config: IPFIOConfig): peptidoform_table = peptidoform_table.astype({"alignment_group_id": "int64"}) + transition_score_metrics = None + need_transition_score_metrics = ( + config.ipf_min_supporting_transitions > 0 + or config.ipf_min_peakgroup_intensity > 0 + ) + if need_transition_score_metrics: + transition_score_metrics = prepare_post_ipf_filter_metrics( + peptidoform_table, + precursor_table, + propagate_signal_across_runs=config.propagate_signal_across_runs, + across_run_confidence_threshold=config.across_run_confidence_threshold, + ) + peptidoform_data = peptidoform_inference( peptidoform_table, precursor_data, config.ipf_grouped_fdr, + config.ipf_grouped_fdr_strategy, config.propagate_signal_across_runs, config.across_run_confidence_threshold, ) @@ -516,6 +1039,12 @@ def infer_peptidoforms(config: IPFIOConfig): # Convert feature_id to int64 peptidoform_data = peptidoform_data.astype({"FEATURE_ID": "int64"}) + peptidoform_data = apply_post_ipf_filters( + peptidoform_data, + transition_score_metrics, + min_supporting_transitions=config.ipf_min_supporting_transitions, + min_peakgroup_intensity=config.ipf_min_peakgroup_intensity, + ) writer = WriterDispatcher.get_writer(config) writer.save_results(result=peptidoform_data) diff --git a/tests/test_ipf.py b/tests/test_ipf.py index 59332d2..afb4fa4 100644 --- a/tests/test_ipf.py +++ b/tests/test_ipf.py @@ -8,10 +8,12 @@ from numpy.testing import assert_almost_equal from pyprophet.ipf import ( - prepare_precursor_bm, - prepare_transition_bm, apply_bm, + apply_post_ipf_filters, compute_model_fdr, + compute_post_ipf_filter_metrics, + prepare_precursor_bm, + prepare_transition_bm, ) @@ -158,3 +160,206 @@ def test_3(): ] ], ) + + +def test_apply_bm_log_space_matches_raw(): + tin = pd.DataFrame( + { + "feature_id": ["id0"] * 6, + "evidence": [0.9, 0.8, 0.7, 0.1, 0.2, 0.3], + "hypothesis": [1, 1, 1, 2, 2, 2], + "prior": [0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + } + ) + + tout_raw = apply_bm(tin) + tout_log = apply_bm(tin, use_log_space=True) + + assert_frame_equal( + tout_raw[ + [ + "feature_id", + "hypothesis", + "likelihood_prior", + "likelihood_sum", + "posterior", + ] + ].sort_values(["feature_id", "hypothesis"]).reset_index(drop=True), + tout_log[ + [ + "feature_id", + "hypothesis", + "likelihood_prior", + "likelihood_sum", + "posterior", + ] + ].sort_values(["feature_id", "hypothesis"]).reset_index(drop=True), + check_exact=False, + atol=1e-12, + rtol=1e-12, + ) + + +def test_apply_bm_log_space_avoids_raw_underflow(): + n = 1000 + tin = pd.DataFrame( + { + "feature_id": ["id0"] * (2 * n), + "evidence": [0.1] * n + [0.2] * n, + "hypothesis": [1] * n + [2] * n, + "prior": [0.5] * (2 * n), + } + ) + + tout_raw = apply_bm(tin) + tout_log = apply_bm(tin, use_log_space=True) + + assert tout_raw["posterior"].sum() == 0 + assert np.isclose(tout_log["posterior"].sum(), 1.0) + assert tout_log.loc[tout_log["hypothesis"] == 2, "posterior"].iloc[0] > 0.999999 + + +def test_apply_bm_evidence_epsilon_clips_extreme_posteriors(): + tin = pd.DataFrame( + { + "feature_id": ["id0"] * 4, + "evidence": [1.0, 1.0, 0.0, 0.0], + "hypothesis": [1, 1, 2, 2], + "prior": [0.5, 0.5, 0.5, 0.5], + } + ) + + tout = apply_bm(tin, evidence_epsilon=1e-6) + + posterior_1 = tout.loc[tout["hypothesis"] == 1, "posterior"].iloc[0] + posterior_2 = tout.loc[tout["hypothesis"] == 2, "posterior"].iloc[0] + + assert 0 < posterior_1 < 1 + assert 0 < posterior_2 < 1 + assert np.isclose(posterior_1 + posterior_2, 1.0) + + +def test_compute_post_ipf_filter_metrics(): + transition_table = pd.DataFrame( + { + "feature_id": [1, 1, 1, 2, 2, 3], + "peptide_id": [10, 10, 11, 20, -1, 30], + "transition_id": [100, 101, 102, 200, 201, 300], + "bmask": [1, 0, 1, 1, 1, 1], + "n_mapped_peptides": [1, 2, 1, 2, 2, 1], + "has_phospho_loss": [1, 0, 0, 1, 0, 0], + "isotope_overlap_score": [0.01, 0.05, 0.02, 0.03, 0.04, 0.01], + } + ) + precursor_table = pd.DataFrame( + { + "feature_id": [1, 2, 3], + "feature_ms2_intensity": [1000.0, 200.0, 50.0], + } + ) + + tout = compute_post_ipf_filter_metrics(transition_table, precursor_table) + tref = pd.DataFrame( + { + "feature_id": [1, 1, 2, 3], + "peptide_id": [10, 11, 20, 30], + "supporting_transitions": [1, 1, 1, 1], + "unique_supporting_transitions": [1, 1, 0, 1], + "phospho_loss_supporting_transitions": [1, 0, 1, 0], + "median_supporting_isotope_overlap": [0.01, 0.02, 0.03, 0.01], + "feature_ms2_intensity": [1000.0, 1000.0, 200.0, 50.0], + } + ) + + assert_frame_equal( + tout.sort_values(["feature_id", "peptide_id"]).reset_index(drop=True), + tref.sort_values(["feature_id", "peptide_id"]).reset_index(drop=True), + ) + +def test_apply_post_ipf_filters(): + result = pd.DataFrame( + { + "FEATURE_ID": [1, 1, 2, 3, 4], + "PEPTIDE_ID": [10, 11, 20, 30, 40], + "PRECURSOR_PEAKGROUP_PEP": [0.1, 0.1, 0.2, 0.3, 0.4], + "QVALUE": [0.01, 0.02, 0.03, 0.04, 0.05], + "PEP": [0.001, 0.002, 0.003, 0.004, 0.005], + } + ) + filter_metrics = pd.DataFrame( + { + "feature_id": [1, 1, 2, 3, 4], + "peptide_id": [10, 11, 20, 30, 40], + "supporting_transitions": [1, 2, 2, 3, 1], + "unique_supporting_transitions": [1, 2, 0, 3, 1], + "phospho_loss_supporting_transitions": [1, 0, 1, 0, 0], + "feature_ms2_intensity": [1000.0, 1000.0, 200.0, 50.0, 1000.0], + } + ) + + tout = apply_post_ipf_filters( + result, + filter_metrics, + min_supporting_transitions=1, + min_unique_supporting_transitions=1, + require_phospho_loss_below_support=2, + min_peakgroup_intensity=100.0, + ) + tref = pd.DataFrame( + { + "FEATURE_ID": [1, 1], + "PEPTIDE_ID": [10, 11], + "PRECURSOR_PEAKGROUP_PEP": [0.1, 0.1], + "QVALUE": [0.01, 0.02], + "PEP": [0.001, 0.002], + } + ) + + assert_frame_equal( + tout.sort_values(["FEATURE_ID", "PEPTIDE_ID"]).reset_index(drop=True), + tref.sort_values(["FEATURE_ID", "PEPTIDE_ID"]).reset_index(drop=True), + ) + + +def test_apply_post_ipf_filters_conditional_intensity(): + result = pd.DataFrame( + { + "FEATURE_ID": [1, 2, 3, 4], + "PEPTIDE_ID": [10, 20, 30, 40], + "PRECURSOR_PEAKGROUP_PEP": [0.1, 0.1, 0.1, 0.1], + "QVALUE": [0.01, 0.02, 0.03, 0.04], + "PEP": [0.001, 0.002, 0.003, 0.004], + } + ) + filter_metrics = pd.DataFrame( + { + "feature_id": [1, 2, 3, 4], + "peptide_id": [10, 20, 30, 40], + "supporting_transitions": [1, 1, 2, 3], + "unique_supporting_transitions": [1, 1, 2, 3], + "phospho_loss_supporting_transitions": [0, 1, 0, 0], + "feature_ms2_intensity": [5000.0, 5000.0, 5000.0, 5000.0], + } + ) + + tout = apply_post_ipf_filters( + result, + filter_metrics, + conditional_min_peakgroup_intensity=10000.0, + conditional_min_peakgroup_intensity_max_supporting_transitions=2, + conditional_min_peakgroup_intensity_no_phospho_loss_only=True, + ) + tref = pd.DataFrame( + { + "FEATURE_ID": [2, 4], + "PEPTIDE_ID": [20, 40], + "PRECURSOR_PEAKGROUP_PEP": [0.1, 0.1], + "QVALUE": [0.02, 0.04], + "PEP": [0.002, 0.004], + } + ) + + assert_frame_equal( + tout.sort_values(["FEATURE_ID", "PEPTIDE_ID"]).reset_index(drop=True), + tref.sort_values(["FEATURE_ID", "PEPTIDE_ID"]).reset_index(drop=True), + ) diff --git a/tests/test_pyprophet_ipf.py b/tests/test_pyprophet_ipf.py index 548a6a6..206c276 100644 --- a/tests/test_pyprophet_ipf.py +++ b/tests/test_pyprophet_ipf.py @@ -132,7 +132,7 @@ def validate_output( for id_col in ["feature_id", "transition_id", "peptide_id"]: if id_col in transition_data.columns: transition_data[id_col] = transition_data[id_col].astype("Int64") - sort_cols = [ + core_transition_cols = [ "feature_id", "transition_id", "pep", @@ -140,9 +140,12 @@ def validate_output( "bmask", "num_peptidoforms", ] + transition_data = transition_data[core_transition_cols] print("Transition Data:", file=regtest) print( - transition_data.sort_values(sort_cols).reset_index(drop=True).head(100), + transition_data.sort_values(core_transition_cols) + .reset_index(drop=True) + .head(100), file=regtest, )