From 911143b9334907551147de1eaa5d6ebcf74fa14a Mon Sep 17 00:00:00 2001 From: singjc Date: Wed, 17 Jun 2026 18:48:53 -0400 Subject: [PATCH 1/5] feat: add experimental IPF grouped FDR and filtering --- pyprophet/_config.py | 14 + pyprophet/cli/ipf.py | 27 + pyprophet/glyco/glycoform.py | 54 +- pyprophet/io/ipf/osw.py | 184 ++++- pyprophet/io/ipf/parquet.py | 57 +- pyprophet/io/ipf/split_parquet.py | 63 +- pyprophet/ipf.py | 1033 ++++++++++++++++++++++++++++- tests/test_ipf.py | 465 ++++++++++++- 8 files changed, 1843 insertions(+), 54 deletions(-) diff --git a/pyprophet/_config.py b/pyprophet/_config.py index 81e11347..0296b78f 100644 --- a/pyprophet/_config.py +++ b/pyprophet/_config.py @@ -456,10 +456,13 @@ class IPFIOConfig(BaseIOConfig): ipf_ms2_scoring (bool): Use MS2 precursor data for IPF. ipf_h0 (bool): Include possibility that peak groups are not covered by the peptidoform space (null hypothesis H0). ipf_grouped_fdr (bool): [Experimental] Compute grouped FDR instead of pooled FDR to support heterogeneous peptidoform counts per peak group. + ipf_grouped_fdr_strategy (Literal["num_peptidoforms", "support_phospho_loss"]): Grouping strategy used when grouped FDR is enabled. ipf_max_precursor_pep (float): Maximum PEP to consider scored precursors in IPF. ipf_max_peakgroup_pep (float): Maximum PEP to consider scored peak groups in IPF. ipf_max_precursor_peakgroup_pep (float): Maximum BHM layer 1 integrated precursor-peakgroup PEP to consider in IPF. ipf_max_transition_pep (float): Maximum PEP to consider scored transitions in IPF. + ipf_min_supporting_transitions (int): Minimum number of supporting transitions required to keep an inferred peptidoform result. + ipf_min_peakgroup_intensity (float): Minimum MS2 peakgroup area intensity required to keep an inferred peptidoform result. propagate_signal_across_runs (bool): Propagate signal across runs (requires alignment step). ipf_max_alignment_pep (float): Maximum PEP to consider for good alignments. across_run_confidence_threshold (float): Maximum PEP threshold for propagating signal across runs for aligned features. @@ -471,10 +474,15 @@ class IPFIOConfig(BaseIOConfig): ipf_ms2_scoring: bool = True ipf_h0: bool = True ipf_grouped_fdr: bool = False + ipf_grouped_fdr_strategy: Literal[ + "num_peptidoforms", "support_phospho_loss" + ] = "num_peptidoforms" ipf_max_precursor_pep: float = 0.7 ipf_max_peakgroup_pep: float = 0.7 ipf_max_precursor_peakgroup_pep: float = 0.4 ipf_max_transition_pep: float = 0.6 + ipf_min_supporting_transitions: int = 0 + ipf_min_peakgroup_intensity: float = 0.0 propagate_signal_across_runs: bool = False ipf_max_alignment_pep: float = 0.7 across_run_confidence_threshold: float = 0.5 @@ -493,10 +501,13 @@ def from_cli_args( ipf_ms2_scoring, ipf_h0, ipf_grouped_fdr, + ipf_grouped_fdr_strategy, ipf_max_precursor_pep, ipf_max_peakgroup_pep, ipf_max_precursor_peakgroup_pep, ipf_max_transition_pep, + ipf_min_supporting_transitions, + ipf_min_peakgroup_intensity, propagate_signal_across_runs, ipf_max_alignment_pep, across_run_confidence_threshold, @@ -516,10 +527,13 @@ def from_cli_args( ipf_ms2_scoring=ipf_ms2_scoring, ipf_h0=ipf_h0, ipf_grouped_fdr=ipf_grouped_fdr, + ipf_grouped_fdr_strategy=ipf_grouped_fdr_strategy, ipf_max_precursor_pep=ipf_max_precursor_pep, ipf_max_peakgroup_pep=ipf_max_peakgroup_pep, ipf_max_precursor_peakgroup_pep=ipf_max_precursor_peakgroup_pep, ipf_max_transition_pep=ipf_max_transition_pep, + ipf_min_supporting_transitions=ipf_min_supporting_transitions, + ipf_min_peakgroup_intensity=ipf_min_peakgroup_intensity, propagate_signal_across_runs=propagate_signal_across_runs, ipf_max_alignment_pep=ipf_max_alignment_pep, across_run_confidence_threshold=across_run_confidence_threshold, diff --git a/pyprophet/cli/ipf.py b/pyprophet/cli/ipf.py index 3a33651d..d64af7b8 100644 --- a/pyprophet/cli/ipf.py +++ b/pyprophet/cli/ipf.py @@ -48,6 +48,13 @@ show_default=True, help="[Experimental] Compute grouped FDR instead of pooled FDR to better support data where peak groups are evaluated to originate from very heterogeneous numbers of peptidoforms.", ) +@click.option( + "--ipf_grouped_fdr_strategy", + default="num_peptidoforms", + show_default=True, + type=click.Choice(["num_peptidoforms", "support_phospho_loss"]), + help="Grouping strategy used when --ipf_grouped_fdr is enabled. 'num_peptidoforms' reproduces the legacy grouping. 'support_phospho_loss' groups by supporting-transition bin (<=1, 2, >=3) and presence of phospho-loss support.", +) @click.option( "--ipf_max_precursor_pep", default=0.7, @@ -76,6 +83,20 @@ type=float, help="Maximum PEP to consider scored transitions in IPF.", ) +@click.option( + "--ipf_min_supporting_transitions", + default=0, + show_default=True, + type=int, + help="Minimum number of supporting identifying transitions required to keep an inferred peptidoform result. Applied as a post-IPF filter; 0 disables the filter.", +) +@click.option( + "--ipf_min_peakgroup_intensity", + default=0.0, + show_default=True, + type=float, + help="Minimum FEATURE_MS2 area intensity required to keep an inferred peptidoform result. Applied as a post-IPF filter; 0 disables the filter.", +) @click.option( "--propagate_signal_across_runs/--no-propagate_signal_across_runs", default=False, @@ -120,10 +141,13 @@ def ipf( ipf_ms2_scoring, ipf_h0, ipf_grouped_fdr, + ipf_grouped_fdr_strategy, ipf_max_precursor_pep, ipf_max_peakgroup_pep, ipf_max_precursor_peakgroup_pep, ipf_max_transition_pep, + ipf_min_supporting_transitions, + ipf_min_peakgroup_intensity, propagate_signal_across_runs, ipf_max_alignment_pep, across_run_confidence_threshold, @@ -155,10 +179,13 @@ def ipf( ipf_ms2_scoring, ipf_h0, ipf_grouped_fdr, + ipf_grouped_fdr_strategy, ipf_max_precursor_pep, ipf_max_peakgroup_pep, ipf_max_precursor_peakgroup_pep, ipf_max_transition_pep, + ipf_min_supporting_transitions, + ipf_min_peakgroup_intensity, propagate_signal_across_runs, ipf_max_alignment_pep, across_run_confidence_threshold, diff --git a/pyprophet/glyco/glycoform.py b/pyprophet/glyco/glycoform.py index f5fe8323..96223aa3 100644 --- a/pyprophet/glyco/glycoform.py +++ b/pyprophet/glyco/glycoform.py @@ -15,24 +15,54 @@ from .pepmass import GlycoPeptideMassCalculator -def get_feature_mapping_across_runs(infile, ipf_max_alignment_pep=1): +def get_feature_mapping_across_runs(infile, min_confidence=0.5): click.echo("Info: Reading Across Run Feature Alignment Mapping ... ", nl=False) start = time.time() con = sqlite3.connect(infile) + query = """ + SELECT + DENSE_RANK() OVER (ORDER BY PRECURSOR_ID, ALIGNMENT_ID) AS ALIGNMENT_GROUP_ID, + ALIGNMENT_ID, + FEATURE_ID, + PRECURSOR_ID, + FEATURE_TYPE + FROM ( + SELECT DISTINCT + ALIGNMENT_ID, + PRECURSOR_ID, + REFERENCE_FEATURE_ID AS FEATURE_ID, + 'REFERENCE' AS FEATURE_TYPE + FROM FEATURE_MS2_ALIGNMENT_CANDIDATE + WHERE SELECTED = 1 + AND MAPPING_CONFIDENCE >= ? + AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID + AND ALIGNED_FEATURE_ID != -1 + + UNION + + SELECT DISTINCT + ALIGNMENT_ID, + PRECURSOR_ID, + ALIGNED_FEATURE_ID AS FEATURE_ID, + 'QUERY' AS FEATURE_TYPE + FROM FEATURE_MS2_ALIGNMENT_CANDIDATE + WHERE SELECTED = 1 + AND MAPPING_CONFIDENCE >= ? + AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID + AND ALIGNED_FEATURE_ID != -1 + ) AS feature_list + ORDER BY + ALIGNMENT_GROUP_ID, + CASE FEATURE_TYPE + WHEN 'REFERENCE' THEN 0 + WHEN 'QUERY' THEN 1 + END + """ + data = pd.read_sql_query( - f"""SELECT - DENSE_RANK() OVER (ORDER BY PRECURSOR_ID, ALIGNMENT_ID) AS ALIGNMENT_GROUP_ID, - ALIGNED_FEATURE_ID AS FEATURE_ID - FROM (SELECT DISTINCT * FROM FEATURE_MS2_ALIGNMENT) AS FEATURE_MS2_ALIGNMENT - INNER JOIN - (SELECT DISTINCT *, MIN(QVALUE) FROM SCORE_ALIGNMENT GROUP BY FEATURE_ID) AS SCORE_ALIGNMENT - ON SCORE_ALIGNMENT.FEATURE_ID = FEATURE_MS2_ALIGNMENT.ALIGNED_FEATURE_ID - WHERE LABEL = 1 - AND SCORE_ALIGNMENT.PEP < {ipf_max_alignment_pep} - ORDER BY ALIGNMENT_GROUP_ID""", - con, + query, con, params=[min_confidence, min_confidence] ) data.columns = [col.lower() for col in data.columns] diff --git a/pyprophet/io/ipf/osw.py b/pyprophet/io/ipf/osw.py index 98e570a3..424561bf 100644 --- a/pyprophet/io/ipf/osw.py +++ b/pyprophet/io/ipf/osw.py @@ -7,7 +7,7 @@ import pandas as pd import click from loguru import logger -from ..util import check_sqlite_table, check_duckdb_table +from ..util import check_sqlite_table, check_duckdb_table, get_table_columns from .._base import BaseOSWReader, BaseOSWWriter from ..._config import IPFIOConfig @@ -121,10 +121,26 @@ def _read_pyp_peakgroup_precursor_duckdb(self, con): ipf_ms1 = cfg.ipf_ms1_scoring ipf_ms2 = cfg.ipf_ms2_scoring pep_threshold = cfg.ipf_max_peakgroup_pep + add_intensity = cfg.ipf_min_peakgroup_intensity > 0 + intensity_select = ( + ",\n FEATURE_MS2.AREA_INTENSITY AS FEATURE_MS2_INTENSITY" + if add_intensity + else "" + ) + feature_ms2_join = ( + "\n INNER JOIN osw.FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID" + if add_intensity + else "" + ) # precursors are restricted according to ipf_max_peakgroup_pep to exclude very poor peak groups logger.info("Reading precursor-level data ...") + if add_intensity and not check_duckdb_table(con, "main", "FEATURE_MS2"): + raise click.ClickException( + "FEATURE_MS2 is required for peakgroup-intensity IPF filtering." + ) + if not ipf_ms1 and ipf_ms2: # only use MS2 precursors if not check_duckdb_table( con, "main", "SCORE_MS2" @@ -136,9 +152,10 @@ def _read_pyp_peakgroup_precursor_duckdb(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, NULL AS MS1_PRECURSOR_PEP, - SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP + SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM osw.PRECURSOR INNER JOIN osw.FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN osw.SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID INNER JOIN ( SELECT FEATURE_ID, PEP @@ -161,9 +178,10 @@ def _read_pyp_peakgroup_precursor_duckdb(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, SCORE_MS1.PEP AS MS1_PRECURSOR_PEP, - NULL AS MS2_PRECURSOR_PEP + NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM osw.PRECURSOR INNER JOIN osw.FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN osw.SCORE_MS1 ON FEATURE.ID = SCORE_MS1.FEATURE_ID INNER JOIN osw.SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID WHERE PRECURSOR.DECOY=0 AND SCORE_MS2.PEP < {pep_threshold} @@ -183,9 +201,10 @@ def _read_pyp_peakgroup_precursor_duckdb(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, SCORE_MS1.PEP AS MS1_PRECURSOR_PEP, - SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP + SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM osw.PRECURSOR INNER JOIN osw.FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN osw.SCORE_MS1 ON FEATURE.ID = SCORE_MS1.FEATURE_ID INNER JOIN osw.SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID INNER JOIN ( @@ -209,9 +228,10 @@ def _read_pyp_peakgroup_precursor_duckdb(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, NULL AS MS1_PRECURSOR_PEP, - NULL AS MS2_PRECURSOR_PEP + NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM osw.PRECURSOR INNER JOIN osw.FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN osw.SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID WHERE PRECURSOR.DECOY=0 AND SCORE_MS2.PEP < {pep_threshold} """ @@ -223,19 +243,69 @@ def _read_pyp_transition_duckdb(self, con): rc = self.config ipf_h0 = rc.ipf_h0 pep_threshold = rc.ipf_max_transition_pep + require_overlap = False + require_phospho_loss = ( + rc.ipf_grouped_fdr + and rc.ipf_grouped_fdr_strategy == "support_phospho_loss" + ) + has_annotation = "ANNOTATION" in get_table_columns(self.infile, "TRANSITION") + + if require_phospho_loss and not has_annotation: + raise click.ClickException( + "ANNOTATION column is required for grouped FDR with strategy support_phospho_loss." + ) + phospho_loss_expr = ( + "CASE WHEN COALESCE(TRANSITION.ANNOTATION, '') LIKE '%-H3O4P1%' THEN 1 ELSE 0 END" + if has_annotation + else "0" + ) # only the evidence is restricted to ipf_max_transition_pep, the peptidoform-space is complete logger.info("Info: Reading peptidoform-level data ...") + overlap_join = "" + overlap_select = "" + if require_overlap: + if not check_duckdb_table(con, "main", "FEATURE_TRANSITION"): + raise click.ClickException( + "FEATURE_TRANSITION is required for overlap-aware IPF score adjustment." + ) + overlap_join = ( + "INNER JOIN osw.FEATURE_TRANSITION " + "ON SCORE_TRANSITION.FEATURE_ID = FEATURE_TRANSITION.FEATURE_ID " + "AND SCORE_TRANSITION.TRANSITION_ID = FEATURE_TRANSITION.TRANSITION_ID" + ) + overlap_select = ( + ", FEATURE_TRANSITION.VAR_ISOTOPE_OVERLAP_SCORE AS ISOTOPE_OVERLAP_SCORE" + ) + queries = { "evidence": f""" - SELECT FEATURE_ID, TRANSITION_ID, PEP + SELECT SCORE_TRANSITION.FEATURE_ID, SCORE_TRANSITION.TRANSITION_ID, PEP{overlap_select} FROM osw.SCORE_TRANSITION INNER JOIN osw.TRANSITION ON SCORE_TRANSITION.TRANSITION_ID = TRANSITION.ID + {overlap_join} WHERE TRANSITION.TYPE != '' AND TRANSITION.DECOY = 0 AND PEP < {pep_threshold} """, + "transition_meta": """ + WITH mapped_counts AS ( + SELECT + TRANSITION_ID, + COUNT(DISTINCT PEPTIDE_ID) AS N_MAPPED_PEPTIDES + FROM osw.TRANSITION_PEPTIDE_MAPPING + GROUP BY TRANSITION_ID + ) + SELECT DISTINCT + TRANSITION.ID AS TRANSITION_ID, + COALESCE(mapped_counts.N_MAPPED_PEPTIDES, 0) AS N_MAPPED_PEPTIDES, + {phospho_loss_expr} AS HAS_PHOSPHO_LOSS + FROM osw.TRANSITION + LEFT JOIN mapped_counts ON TRANSITION.ID = mapped_counts.TRANSITION_ID + WHERE TRANSITION.TYPE != '' + AND TRANSITION.DECOY = 0 + """.format(phospho_loss_expr=phospho_loss_expr), "bitmask": """ SELECT DISTINCT TRANSITION.ID AS TRANSITION_ID, PEPTIDE_ID, 1 AS BMASK FROM osw.SCORE_TRANSITION @@ -265,6 +335,9 @@ def _read_pyp_transition_duckdb(self, con): # Execute evidence = con.execute(queries["evidence"]).fetchdf().rename(columns=str.lower) + transition_meta = ( + con.execute(queries["transition_meta"]).fetchdf().rename(columns=str.lower) + ) bitmask = con.execute(queries["bitmask"]).fetchdf().rename(columns=str.lower) num_peptidoforms = ( con.execute(queries["num_peptidoforms"]).fetchdf().rename(columns=str.lower) @@ -290,6 +363,7 @@ def _read_pyp_transition_duckdb(self, con): # Merge trans_pf = pd.merge(evidence, peptidoforms, how="outer", on="feature_id") + trans_pf = pd.merge(trans_pf, transition_meta, how="left", on="transition_id") trans_pf_bm = pd.merge( trans_pf, bitmask, how="left", on=["transition_id", "peptide_id"] ).fillna(0) @@ -420,6 +494,22 @@ def _read_pyp_peakgroup_precursor_sqlite(self, con): ipf_ms1 = cfg.ipf_ms1_scoring ipf_ms2 = cfg.ipf_ms2_scoring pep_threshold = cfg.ipf_max_peakgroup_pep + add_intensity = cfg.ipf_min_peakgroup_intensity > 0 + intensity_select = ( + ",\n FEATURE_MS2.AREA_INTENSITY AS FEATURE_MS2_INTENSITY" + if add_intensity + else "" + ) + feature_ms2_join = ( + "\n INNER JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID" + if add_intensity + else "" + ) + + if add_intensity and not check_sqlite_table(con, "FEATURE_MS2"): + raise click.ClickException( + "FEATURE_MS2 is required for peakgroup-intensity IPF filtering." + ) if not ipf_ms1 and ipf_ms2: # only use MS2 precursors if not check_sqlite_table(con, "SCORE_MS2") or not check_sqlite_table( @@ -433,9 +523,10 @@ def _read_pyp_peakgroup_precursor_sqlite(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, NULL AS MS1_PRECURSOR_PEP, - SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP + SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM PRECURSOR INNER JOIN FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID INNER JOIN ( SELECT FEATURE_ID, PEP @@ -458,9 +549,10 @@ def _read_pyp_peakgroup_precursor_sqlite(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, SCORE_MS1.PEP AS MS1_PRECURSOR_PEP, - NULL AS MS2_PRECURSOR_PEP + NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM PRECURSOR INNER JOIN FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN SCORE_MS1 ON FEATURE.ID = SCORE_MS1.FEATURE_ID INNER JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID WHERE PRECURSOR.DECOY=0 AND SCORE_MS2.PEP < ? @@ -479,9 +571,10 @@ def _read_pyp_peakgroup_precursor_sqlite(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, SCORE_MS1.PEP AS MS1_PRECURSOR_PEP, - SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP + SCORE_TRANSITION.PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM PRECURSOR INNER JOIN FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN SCORE_MS1 ON FEATURE.ID = SCORE_MS1.FEATURE_ID INNER JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID INNER JOIN ( @@ -498,33 +591,90 @@ def _read_pyp_peakgroup_precursor_sqlite(self, con): SELECT FEATURE.ID AS FEATURE_ID, SCORE_MS2.PEP AS MS2_PEAKGROUP_PEP, NULL AS MS1_PRECURSOR_PEP, - NULL AS MS2_PRECURSOR_PEP + NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM PRECURSOR INNER JOIN FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID + {feature_ms2_join} INNER JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID WHERE PRECURSOR.DECOY=0 AND SCORE_MS2.PEP < ? """ - df = pd.read_sql_query(query, con, params=[pep_threshold]) + df = pd.read_sql_query( + query.format( + intensity_select=intensity_select, feature_ms2_join=feature_ms2_join + ), + con, + params=[pep_threshold], + ) return df.rename(columns=str.lower) def _read_pyp_transition_sqlite(self, con): rc = self.config ipf_h0 = rc.ipf_h0 pep_threshold = rc.ipf_max_transition_pep + require_overlap = False + require_phospho_loss = ( + rc.ipf_grouped_fdr + and rc.ipf_grouped_fdr_strategy == "support_phospho_loss" + ) + has_annotation = "ANNOTATION" in get_table_columns(self.infile, "TRANSITION") + + if require_phospho_loss and not has_annotation: + raise click.ClickException( + "ANNOTATION column is required for grouped FDR with strategy support_phospho_loss." + ) + phospho_loss_expr = ( + "CASE WHEN COALESCE(TRANSITION.ANNOTATION, '') LIKE '%-H3O4P1%' THEN 1 ELSE 0 END" + if has_annotation + else "0" + ) # only the evidence is restricted to ipf_max_transition_pep, the peptidoform-space is complete logger.info("Info: Reading peptidoform-level data ...") + overlap_join = "" + overlap_select = "" + if require_overlap: + if not check_sqlite_table(con, "FEATURE_TRANSITION"): + raise click.ClickException( + "FEATURE_TRANSITION is required for overlap-aware IPF score adjustment." + ) + overlap_join = ( + "INNER JOIN FEATURE_TRANSITION " + "ON SCORE_TRANSITION.FEATURE_ID = FEATURE_TRANSITION.FEATURE_ID " + "AND SCORE_TRANSITION.TRANSITION_ID = FEATURE_TRANSITION.TRANSITION_ID" + ) + overlap_select = ( + ", FEATURE_TRANSITION.VAR_ISOTOPE_OVERLAP_SCORE AS ISOTOPE_OVERLAP_SCORE" + ) + queries = { "evidence": """ - SELECT FEATURE_ID, TRANSITION_ID, PEP + SELECT SCORE_TRANSITION.FEATURE_ID, SCORE_TRANSITION.TRANSITION_ID, PEP{overlap_select} FROM SCORE_TRANSITION INNER JOIN TRANSITION ON SCORE_TRANSITION.TRANSITION_ID = TRANSITION.ID + {overlap_join} WHERE TRANSITION.TYPE != '' AND TRANSITION.DECOY = 0 AND PEP < ? """, + "transition_meta": """ + WITH mapped_counts AS ( + SELECT + TRANSITION_ID, + COUNT(DISTINCT PEPTIDE_ID) AS N_MAPPED_PEPTIDES + FROM TRANSITION_PEPTIDE_MAPPING + GROUP BY TRANSITION_ID + ) + SELECT DISTINCT + TRANSITION.ID AS TRANSITION_ID, + COALESCE(mapped_counts.N_MAPPED_PEPTIDES, 0) AS N_MAPPED_PEPTIDES, + {phospho_loss_expr} AS HAS_PHOSPHO_LOSS + FROM TRANSITION + LEFT JOIN mapped_counts ON TRANSITION.ID = mapped_counts.TRANSITION_ID + WHERE TRANSITION.TYPE != '' + AND TRANSITION.DECOY = 0 + """.format(phospho_loss_expr=phospho_loss_expr), "bitmask": """ SELECT DISTINCT TRANSITION.ID AS TRANSITION_ID, PEPTIDE_ID, 1 AS BMASK FROM SCORE_TRANSITION @@ -554,7 +704,14 @@ def _read_pyp_transition_sqlite(self, con): # Execute evidence = pd.read_sql_query( - queries["evidence"], con, params=[pep_threshold] + queries["evidence"].format( + overlap_select=overlap_select, overlap_join=overlap_join + ), + con, + params=[pep_threshold], + ).rename(columns=str.lower) + transition_meta = pd.read_sql_query( + queries["transition_meta"], con ).rename(columns=str.lower) bitmask = pd.read_sql_query(queries["bitmask"], con).rename(columns=str.lower) num_peptidoforms = pd.read_sql_query(queries["num_peptidoforms"], con).rename( @@ -581,6 +738,7 @@ def _read_pyp_transition_sqlite(self, con): # Merge trans_pf = pd.merge(evidence, peptidoforms, how="outer", on="feature_id") + trans_pf = pd.merge(trans_pf, transition_meta, how="left", on="transition_id") trans_pf_bm = pd.merge( trans_pf, bitmask, how="left", on=["transition_id", "peptide_id"] ).fillna(0) diff --git a/pyprophet/io/ipf/parquet.py b/pyprophet/io/ipf/parquet.py index f97e6bde..a00ec449 100644 --- a/pyprophet/io/ipf/parquet.py +++ b/pyprophet/io/ipf/parquet.py @@ -71,10 +71,21 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: ipf_ms1 = cfg.ipf_ms1_scoring ipf_ms2 = cfg.ipf_ms2_scoring pep_threshold = cfg.ipf_max_peakgroup_pep + add_intensity = cfg.ipf_min_peakgroup_intensity > 0 logger.info("Reading precursor-level data ...") all_cols = get_parquet_column_names(self.infile) + intensity_select = ( + ",\n FEATURE_MS2_AREA_INTENSITY AS FEATURE_MS2_INTENSITY" + if add_intensity + else "" + ) + + if add_intensity and "FEATURE_MS2_AREA_INTENSITY" not in all_cols: + raise click.ClickException( + "FEATURE_MS2_AREA_INTENSITY is required for peakgroup-intensity IPF filtering." + ) if not ipf_ms1 and ipf_ms2: if not any(c.startswith("SCORE_MS2_") for c in all_cols) or not any( @@ -85,7 +96,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: SELECT FEATURE_ID, SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, NULL AS MS1_PRECURSOR_PEP, - SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP + SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM data WHERE PRECURSOR_DECOY = 0 AND TRANSITION_TYPE = '' @@ -102,7 +113,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: SELECT FEATURE_ID, SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, SCORE_MS1_PEP AS MS1_PRECURSOR_PEP, - NULL AS MS2_PRECURSOR_PEP + NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM data WHERE PRECURSOR_DECOY = 0 AND SCORE_MS2_PEP < {pep_threshold} @@ -123,7 +134,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: SELECT FEATURE_ID, SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, SCORE_MS1_PEP AS MS1_PRECURSOR_PEP, - SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP + SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM data WHERE PRECURSOR_DECOY = 0 AND TRANSITION_TYPE = '' @@ -140,7 +151,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: SELECT FEATURE_ID, SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, NULL AS MS1_PRECURSOR_PEP, - NULL AS MS2_PRECURSOR_PEP + NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM data WHERE PRECURSOR_DECOY = 0 AND SCORE_MS2_PEP < {pep_threshold} @@ -153,6 +164,11 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: cfg = self.config ipf_h0 = cfg.ipf_h0 pep_threshold = cfg.ipf_max_transition_pep + require_phospho_loss = ( + cfg.ipf_grouped_fdr + and cfg.ipf_grouped_fdr_strategy == "support_phospho_loss" + ) + require_overlap = False logger.info("Reading peptidoform-level data ...") @@ -162,12 +178,28 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: raise click.ClickException( "IPF_PEPTIDE_ID column is required in transition features." ) + has_annotation = "ANNOTATION" in all_cols + if require_phospho_loss and not has_annotation: + raise click.ClickException( + "ANNOTATION column is required for grouped FDR with strategy support_phospho_loss." + ) + has_overlap_col = "FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE" in all_cols + has_phospho_loss_sql = ( + "MAX(CASE WHEN COALESCE(ANNOTATION, '') LIKE '%-H3O4P1%' THEN 1 ELSE 0 END) AS HAS_PHOSPHO_LOSS" + if has_annotation + else "0 AS HAS_PHOSPHO_LOSS" + ) + overlap_select = ( + ", FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE AS ISOTOPE_OVERLAP_SCORE" + if has_overlap_col + else "" + ) # Use DuckDB view `data` created in _init_duckdb_views() # Evidence table: transition-level PEPs query = f""" - SELECT FEATURE_ID, TRANSITION_ID, SCORE_TRANSITION_PEP AS PEP + SELECT FEATURE_ID, TRANSITION_ID, SCORE_TRANSITION_PEP AS PEP{overlap_select} FROM data WHERE TRANSITION_TYPE != '' AND TRANSITION_DECOY = 0 @@ -176,6 +208,20 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: """ evidence = con.execute(query).df().rename(columns=str.lower) + query = f""" + SELECT + TRANSITION_ID, + COUNT(DISTINCT IPF_PEPTIDE_ID) AS N_MAPPED_PEPTIDES, + {has_phospho_loss_sql} + FROM data + WHERE TRANSITION_TYPE != '' + AND TRANSITION_DECOY = 0 + AND SCORE_TRANSITION_SCORE IS NOT NULL + AND IPF_PEPTIDE_ID IS NOT NULL + GROUP BY TRANSITION_ID + """ + transition_meta = con.execute(query).df().rename(columns=str.lower) + # Bitmask table: transition-peptidoform presence query = """ SELECT DISTINCT TRANSITION_ID, IPF_PEPTIDE_ID AS PEPTIDE_ID, 1 AS BMASK @@ -223,6 +269,7 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: # Merge all parts into final dataframe trans_pf = pd.merge(evidence, peptidoforms, how="outer", on="feature_id") + trans_pf = pd.merge(trans_pf, transition_meta, how="left", on="transition_id") trans_pf_bm = pd.merge( trans_pf, bitmask, how="left", on=["transition_id", "peptide_id"] ).fillna(0) diff --git a/pyprophet/io/ipf/split_parquet.py b/pyprophet/io/ipf/split_parquet.py index 085cdea0..a93a91a5 100644 --- a/pyprophet/io/ipf/split_parquet.py +++ b/pyprophet/io/ipf/split_parquet.py @@ -63,6 +63,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: ipf_ms1 = cfg.ipf_ms1_scoring ipf_ms2 = cfg.ipf_ms2_scoring pep_threshold = cfg.ipf_max_peakgroup_pep + add_intensity = cfg.ipf_min_peakgroup_intensity > 0 logger.info("Reading precursor-level data ...") @@ -81,6 +82,16 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: all_precursor_cols = get_parquet_column_names(precursor_files[0]) all_transition_cols = get_parquet_column_names(transition_files[0]) + intensity_select = ( + ",\n p.FEATURE_MS2_AREA_INTENSITY AS FEATURE_MS2_INTENSITY" + if add_intensity + else "" + ) + + if add_intensity and "FEATURE_MS2_AREA_INTENSITY" not in all_precursor_cols: + raise click.ClickException( + "FEATURE_MS2_AREA_INTENSITY is required for peakgroup-intensity IPF filtering." + ) # con.execute( # f"CREATE VIEW precursors AS SELECT * FROM read_parquet({precursor_files})" @@ -98,7 +109,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: raise click.ClickException("Apply MS2 + transition scoring before IPF.") query = f""" SELECT p.FEATURE_ID, p.SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, - NULL AS MS1_PRECURSOR_PEP, t.SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP + NULL AS MS1_PRECURSOR_PEP, t.SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM precursors p INNER JOIN ( SELECT FEATURE_ID, SCORE_TRANSITION_PEP @@ -115,7 +126,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: raise click.ClickException("Apply MS1 + MS2 scoring before IPF.") query = f""" SELECT p.FEATURE_ID, p.SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, - p.SCORE_MS1_PEP AS MS1_PRECURSOR_PEP, NULL AS MS2_PRECURSOR_PEP + p.SCORE_MS1_PEP AS MS1_PRECURSOR_PEP, NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM precursors p WHERE p.PRECURSOR_DECOY = 0 AND p.SCORE_MS2_PEP < {pep_threshold}; """ @@ -133,7 +144,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: ) query = f""" SELECT p.FEATURE_ID, p.SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, - p.SCORE_MS1_PEP AS MS1_PRECURSOR_PEP, t.SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP + p.SCORE_MS1_PEP AS MS1_PRECURSOR_PEP, t.SCORE_TRANSITION_PEP AS MS2_PRECURSOR_PEP{intensity_select} FROM precursors p INNER JOIN ( SELECT FEATURE_ID, SCORE_TRANSITION_PEP @@ -152,7 +163,7 @@ def _read_pyp_peakgroup_precursor(self, con) -> pd.DataFrame: raise click.ClickException("Apply MS2 + transition scoring before IPF.") query = f""" SELECT p.FEATURE_ID, p.SCORE_MS2_PEP AS MS2_PEAKGROUP_PEP, - NULL AS MS1_PRECURSOR_PEP, NULL AS MS2_PRECURSOR_PEP + NULL AS MS1_PRECURSOR_PEP, NULL AS MS2_PRECURSOR_PEP{intensity_select} FROM precursors p WHERE p.PRECURSOR_DECOY = 0 AND p.SCORE_MS2_PEP < {pep_threshold}; """ @@ -165,6 +176,11 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: cfg = self.config ipf_h0 = cfg.ipf_h0 pep_threshold = cfg.ipf_max_transition_pep + require_phospho_loss = ( + cfg.ipf_grouped_fdr + and cfg.ipf_grouped_fdr_strategy == "support_phospho_loss" + ) + require_overlap = False logger.info("Reading peptidoform-level data ...") @@ -188,6 +204,28 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: raise click.ClickException( "IPF_PEPTIDE_ID column is required in transition features." ) + has_annotation = "ANNOTATION" in all_transition_cols + if require_phospho_loss and not has_annotation: + raise click.ClickException( + "ANNOTATION column is required for grouped FDR with strategy support_phospho_loss." + ) + has_overlap_col = ( + "FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE" in all_transition_cols + ) + if require_overlap and not has_overlap_col: + raise click.ClickException( + "FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE is required for overlap-aware IPF score adjustment." + ) + has_phospho_loss_sql = ( + "MAX(CASE WHEN COALESCE(t.ANNOTATION, '') LIKE '%-H3O4P1%' THEN 1 ELSE 0 END) AS HAS_PHOSPHO_LOSS" + if has_annotation + else "0 AS HAS_PHOSPHO_LOSS" + ) + overlap_select = ( + ", t.FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE AS ISOTOPE_OVERLAP_SCORE" + if has_overlap_col + else "" + ) # Load all transition files into DuckDB view con.execute( @@ -196,7 +234,7 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: # Evidence table: transition-level PEPs query = f""" - SELECT t.FEATURE_ID, t.TRANSITION_ID, t.SCORE_TRANSITION_PEP AS PEP + SELECT t.FEATURE_ID, t.TRANSITION_ID, t.SCORE_TRANSITION_PEP AS PEP{overlap_select} FROM transition t WHERE t.TRANSITION_TYPE != '' AND t.TRANSITION_DECOY = 0 @@ -205,6 +243,20 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: """ evidence = con.execute(query).df().rename(columns=str.lower) + query = f""" + SELECT + t.TRANSITION_ID, + COUNT(DISTINCT t.IPF_PEPTIDE_ID) AS N_MAPPED_PEPTIDES, + {has_phospho_loss_sql} + FROM transition t + WHERE t.TRANSITION_TYPE != '' + AND t.TRANSITION_DECOY = 0 + AND t.SCORE_TRANSITION_SCORE IS NOT NULL + AND t.IPF_PEPTIDE_ID IS NOT NULL + GROUP BY t.TRANSITION_ID + """ + transition_meta = con.execute(query).df().rename(columns=str.lower) + # Bitmask table: transition-peptidoform presence query = """ SELECT DISTINCT t.TRANSITION_ID, t.IPF_PEPTIDE_ID AS PEPTIDE_ID, 1 AS BMASK @@ -250,6 +302,7 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: # Merge all parts into final dataframe trans_pf = pd.merge(evidence, peptidoforms, how="outer", on="feature_id") + trans_pf = pd.merge(trans_pf, transition_meta, how="left", on="transition_id") trans_pf_bm = pd.merge( trans_pf, bitmask, how="left", on=["transition_id", "peptide_id"] ).fillna(0) diff --git a/pyprophet/ipf.py b/pyprophet/ipf.py index d91f3014..62d5c2c5 100644 --- a/pyprophet/ipf.py +++ b/pyprophet/ipf.py @@ -30,6 +30,7 @@ import numpy as np import pandas as pd from loguru import logger +from scipy.special import expit, logsumexp from scipy.stats import rankdata from ._config import IPFIOConfig @@ -63,6 +64,914 @@ def compute_model_fdr(data_in): return fdr +def compute_grouped_model_fdr(data_in, group_keys, log_prefix="Grouped FDR"): + """ + Compute model-based FDR estimates independently within each group. + + Args: + data_in (array-like): Input posterior error probabilities. + group_keys (array-like): Group label per row in ``data_in``. + log_prefix (str): Prefix used for group-count logging. + + Returns: + np.ndarray: FDR estimates for the input data, grouped by ``group_keys``. + """ + data = np.asarray(data_in, dtype=float) + groups = pd.Series(group_keys).fillna("NA").astype(str) + if len(groups) != len(data): + raise ValueError("group_keys must have the same length as data_in.") + + qvalues = np.empty(len(data), dtype=float) + counts = groups.value_counts().sort_index() + logger.info( + f"{log_prefix}: " + + ", ".join(f"{group}={count}" for group, count in counts.items()) + ) + + for group, idx in groups.groupby(groups).groups.items(): + idx_arr = np.fromiter(idx, dtype=int) + qvalues[idx_arr] = compute_model_fdr(data[idx_arr]) + + return qvalues + + +def _support_phospho_loss_group_keys(metrics): + """ + Build coarse FDR strata from support strength and phospho-loss support. + + Support is grouped into <=1, 2, and >=3 supporting transitions. Each support + stratum is then split by whether at least one phospho-loss supporting + transition is present. + """ + supporting = metrics["supporting_transitions"].fillna(0).astype(int).to_numpy() + phospho_loss = ( + metrics["phospho_loss_supporting_transitions"].fillna(0).astype(int).to_numpy() + ) + + support_labels = np.where( + supporting <= 1, + "support_le1", + np.where(supporting == 2, "support_eq2", "support_ge3"), + ) + phospho_labels = np.where(phospho_loss > 0, "phloss", "no_phloss") + return pd.Series( + support_labels + "__" + phospho_labels, + index=metrics.index, + dtype="object", + ) + + +def compute_ipf_qvalues( + pf_pp_data, + grouped_fdr=False, + grouped_fdr_strategy="num_peptidoforms", + grouping_metrics=None, +): + """ + Compute IPF q-values using pooled or grouped model-based FDR. + + Grouped FDR currently supports: + - ``num_peptidoforms``: the legacy grouping by per-feature peptidoform count + - ``support_phospho_loss``: grouping by supporting-transition bin and + phospho-loss support + """ + if not grouped_fdr: + return compute_model_fdr(pf_pp_data["pep"]) + + if grouped_fdr_strategy == "num_peptidoforms": + if "num_peptidoforms" not in pf_pp_data.columns: + raise ValueError( + "num_peptidoforms is required for grouped FDR with strategy 'num_peptidoforms'." + ) + return compute_grouped_model_fdr( + pf_pp_data["pep"], + pf_pp_data["num_peptidoforms"].fillna(-1).astype(int), + log_prefix="Grouped FDR by num_peptidoforms", + ) + + if grouped_fdr_strategy == "support_phospho_loss": + if grouping_metrics is None: + raise ValueError( + "grouping_metrics is required for grouped FDR with strategy 'support_phospho_loss'." + ) + required_cols = { + "feature_id", + "peptide_id", + "supporting_transitions", + "phospho_loss_supporting_transitions", + } + missing = required_cols - set(grouping_metrics.columns) + if missing: + raise ValueError( + "grouping_metrics is missing required columns for grouped FDR " + f"strategy 'support_phospho_loss': {sorted(missing)}" + ) + + merged = pf_pp_data.merge( + grouping_metrics[ + [ + "feature_id", + "peptide_id", + "supporting_transitions", + "phospho_loss_supporting_transitions", + ] + ].drop_duplicates(), + left_on=["feature_id", "hypothesis"], + right_on=["feature_id", "peptide_id"], + how="left", + ) + group_keys = _support_phospho_loss_group_keys(merged) + return compute_grouped_model_fdr( + merged["pep"], + group_keys, + log_prefix="Grouped FDR by support/phospho-loss", + ) + + raise ValueError( + "Unsupported ipf_grouped_fdr_strategy: " + f"{grouped_fdr_strategy!r}. Expected 'num_peptidoforms' or 'support_phospho_loss'." + ) + + +def compute_post_ipf_filter_metrics(transition_table, precursor_table): + """ + Computes per-feature / per-hypothesis metrics used for optional post-IPF filtering. + + Args: + transition_table (pd.DataFrame): Transition-level peptidoform table before Bayesian modeling. + precursor_table (pd.DataFrame): Peakgroup / precursor-level table, optionally including feature_ms2_intensity. + + Returns: + pd.DataFrame: One row per feature_id + peptide_id with supporting transition + counts and optional feature_ms2_intensity. + """ + hypotheses = transition_table.loc[ + transition_table["peptide_id"] != -1, ["feature_id", "peptide_id"] + ].drop_duplicates() + + supporting_cols = ["feature_id", "peptide_id", "transition_id"] + supporting_cols.extend( + [ + col + for col in [ + "n_mapped_peptides", + "has_phospho_loss", + "isotope_overlap_score", + ] + if col in transition_table.columns + ] + ) + supporting_rows = transition_table.loc[ + (transition_table["peptide_id"] != -1) & (transition_table["bmask"] == 1), + supporting_cols, + ].drop_duplicates() + + supporting = ( + supporting_rows.groupby(["feature_id", "peptide_id"], as_index=False)[ + "transition_id" + ] + .nunique() + .rename(columns={"transition_id": "supporting_transitions"}) + ) + + if "n_mapped_peptides" in supporting_rows.columns: + unique_supporting = ( + supporting_rows.loc[supporting_rows["n_mapped_peptides"] == 1] + .groupby(["feature_id", "peptide_id"], as_index=False)["transition_id"] + .nunique() + .rename(columns={"transition_id": "unique_supporting_transitions"}) + ) + else: + unique_supporting = pd.DataFrame( + columns=["feature_id", "peptide_id", "unique_supporting_transitions"] + ) + + if "has_phospho_loss" in supporting_rows.columns: + phospho_loss_supporting = ( + supporting_rows.loc[supporting_rows["has_phospho_loss"] == 1] + .groupby(["feature_id", "peptide_id"], as_index=False)["transition_id"] + .nunique() + .rename( + columns={"transition_id": "phospho_loss_supporting_transitions"} + ) + ) + else: + phospho_loss_supporting = pd.DataFrame( + columns=[ + "feature_id", + "peptide_id", + "phospho_loss_supporting_transitions", + ] + ) + + if "isotope_overlap_score" in supporting_rows.columns: + supporting_overlap = ( + supporting_rows.groupby(["feature_id", "peptide_id"], as_index=False)[ + "isotope_overlap_score" + ] + .median() + .rename( + columns={ + "isotope_overlap_score": "median_supporting_isotope_overlap" + } + ) + ) + else: + supporting_overlap = pd.DataFrame( + columns=[ + "feature_id", + "peptide_id", + "median_supporting_isotope_overlap", + ] + ) + + metrics = ( + hypotheses.merge(supporting, on=["feature_id", "peptide_id"], how="left") + .merge(unique_supporting, on=["feature_id", "peptide_id"], how="left") + .merge( + phospho_loss_supporting, + on=["feature_id", "peptide_id"], + how="left", + ) + .merge(supporting_overlap, on=["feature_id", "peptide_id"], how="left") + .fillna( + { + "supporting_transitions": 0, + "unique_supporting_transitions": 0, + "phospho_loss_supporting_transitions": 0, + } + ) + ) + metrics["supporting_transitions"] = metrics["supporting_transitions"].astype(int) + metrics["unique_supporting_transitions"] = metrics[ + "unique_supporting_transitions" + ].astype(int) + metrics["phospho_loss_supporting_transitions"] = metrics[ + "phospho_loss_supporting_transitions" + ].astype(int) + + if "feature_ms2_intensity" in precursor_table.columns: + metrics = metrics.merge( + precursor_table[["feature_id", "feature_ms2_intensity"]].drop_duplicates(), + on="feature_id", + how="left", + ) + + return metrics + + +def prepare_post_ipf_filter_metrics( + transition_table, + precursor_table, + propagate_signal_across_runs=False, + across_run_confidence_threshold=0.5, +): + """ + Prepares post-IPF filter metrics from the same transition evidence state used by IPF. + + When across-run propagation is enabled, supporting-transition counts are computed on the + propagated evidence table so the post-IPF filter matches the final inference behavior. + + Args: + transition_table (pd.DataFrame): Transition-level peptidoform table. + precursor_table (pd.DataFrame): Peakgroup / precursor-level table. + propagate_signal_across_runs (bool): Whether IPF propagates evidence across runs. + across_run_confidence_threshold (float): Confidence threshold for signal propagation. + + Returns: + pd.DataFrame: Metrics from compute_post_ipf_filter_metrics(). + """ + filter_transition_table = transition_table.copy() + + if propagate_signal_across_runs: + non_prop_data = filter_transition_table.loc[ + filter_transition_table["feature_id"] + == filter_transition_table["alignment_group_id"] + ] + prop_data = filter_transition_table.loc[ + filter_transition_table["feature_id"] + != filter_transition_table["alignment_group_id"] + ] + + if len(prop_data) > 0: + group_cols = [ + "feature_id", + "transition_id", + "peptide_id", + "bmask", + "num_peptidoforms", + "alignment_group_id", + ] + group_cols.extend( + [ + col + for col in [ + "n_mapped_peptides", + "has_phospho_loss", + "isotope_overlap_score", + ] + if col in filter_transition_table.columns + ] + ) + propagated_data = ( + prop_data.groupby("alignment_group_id", group_keys=False) + .apply( + lambda df: transfer_confident_evidence_across_runs( + df, + across_run_confidence_threshold, + group_cols=group_cols, + value_cols=["pep"], + ) + ) + .reset_index(drop=True) + ) + filter_transition_table = pd.concat( + [non_prop_data, propagated_data], ignore_index=True + ) + else: + filter_transition_table = non_prop_data.copy() + + return compute_post_ipf_filter_metrics(filter_transition_table, precursor_table) + + +def _ipf_filters_active( + min_supporting_transitions=0, + min_unique_supporting_transitions=0, + require_phospho_loss_below_support=0, + min_peakgroup_intensity=0.0, + conditional_min_peakgroup_intensity=0.0, +): + return ( + min_supporting_transitions > 0 + or min_unique_supporting_transitions > 0 + or require_phospho_loss_below_support > 0 + or min_peakgroup_intensity > 0 + or conditional_min_peakgroup_intensity > 0 + ) + + +def _ipf_score_adjustment_active( + score_support_log_weight=0.0, + score_unique_support_log_weight=0.0, + score_log_intensity_weight=0.0, + score_phospho_loss_bonus=0.0, + score_overlap_penalty_weight=0.0, +): + return ( + score_support_log_weight != 0.0 + or score_unique_support_log_weight != 0.0 + or score_log_intensity_weight != 0.0 + or score_phospho_loss_bonus != 0.0 + or score_overlap_penalty_weight != 0.0 + ) + + +def _transition_evidence_adjustment_active( + transition_evidence_nonunique_logit_scale=1.0, + transition_evidence_no_phospho_loss_logit_scale=1.0, + transition_evidence_overlap_logit_scale_weight=0.0, + transition_evidence_max_isotope_overlap=0.0, +): + return ( + transition_evidence_nonunique_logit_scale != 1.0 + or transition_evidence_no_phospho_loss_logit_scale != 1.0 + or transition_evidence_overlap_logit_scale_weight != 0.0 + or transition_evidence_max_isotope_overlap > 0.0 + ) + + +def apply_transition_evidence_adjustment( + transition_table, + transition_evidence_nonunique_logit_scale=1.0, + transition_evidence_no_phospho_loss_logit_scale=1.0, + transition_evidence_overlap_logit_scale_weight=0.0, + transition_evidence_overlap_reference=0.02, + transition_evidence_max_isotope_overlap=0.0, +): + """ + Adjust transition-level PEPs before BM so weak/non-informative identifying transitions + contribute less extreme evidence. + + The adjustment shrinks the transition PEP logit toward zero, which weakens both supporting + and opposing evidence symmetrically once the BM evidence is constructed from PEP and BMASK. + Optionally, high-overlap transitions can be removed entirely before BM. + """ + if not _transition_evidence_adjustment_active( + transition_evidence_nonunique_logit_scale=transition_evidence_nonunique_logit_scale, + transition_evidence_no_phospho_loss_logit_scale=transition_evidence_no_phospho_loss_logit_scale, + transition_evidence_overlap_logit_scale_weight=transition_evidence_overlap_logit_scale_weight, + transition_evidence_max_isotope_overlap=transition_evidence_max_isotope_overlap, + ): + return transition_table + + if transition_evidence_nonunique_logit_scale <= 0: + raise ValueError("transition_evidence_nonunique_logit_scale must be > 0.") + if transition_evidence_no_phospho_loss_logit_scale <= 0: + raise ValueError( + "transition_evidence_no_phospho_loss_logit_scale must be > 0." + ) + if transition_evidence_overlap_logit_scale_weight < 0: + raise ValueError( + "transition_evidence_overlap_logit_scale_weight must be >= 0." + ) + if transition_evidence_overlap_reference < 0: + raise ValueError("transition_evidence_overlap_reference must be >= 0.") + if transition_evidence_max_isotope_overlap < 0: + raise ValueError("transition_evidence_max_isotope_overlap must be >= 0.") + + adjusted = transition_table.copy() + + if transition_evidence_max_isotope_overlap > 0: + if "isotope_overlap_score" not in adjusted.columns: + raise ValueError( + "isotope_overlap_score is required for transition_evidence_max_isotope_overlap filtering." + ) + before = len(adjusted) + adjusted = adjusted[ + adjusted["isotope_overlap_score"].fillna(transition_evidence_overlap_reference) + <= transition_evidence_max_isotope_overlap + ].copy() + logger.info( + "Applied pre-BM transition overlap filter: " + f"kept {len(adjusted)}/{before} transition rows with isotope_overlap_score <= " + f"{transition_evidence_max_isotope_overlap}." + ) + + if "pep" not in adjusted.columns: + raise ValueError("Transition table must contain pep for evidence adjustment.") + + scales = np.ones(len(adjusted), dtype=float) + + if transition_evidence_nonunique_logit_scale != 1.0: + if "n_mapped_peptides" not in adjusted.columns: + raise ValueError( + "n_mapped_peptides is required for transition_evidence_nonunique_logit_scale." + ) + scales *= np.where( + adjusted["n_mapped_peptides"].fillna(0).astype(int).to_numpy() > 1, + transition_evidence_nonunique_logit_scale, + 1.0, + ) + + if transition_evidence_no_phospho_loss_logit_scale != 1.0: + if "has_phospho_loss" not in adjusted.columns: + raise ValueError( + "has_phospho_loss is required for transition_evidence_no_phospho_loss_logit_scale." + ) + scales *= np.where( + adjusted["has_phospho_loss"].fillna(0).astype(int).to_numpy() > 0, + 1.0, + transition_evidence_no_phospho_loss_logit_scale, + ) + + if transition_evidence_overlap_logit_scale_weight != 0.0: + if "isotope_overlap_score" not in adjusted.columns: + raise ValueError( + "isotope_overlap_score is required for transition_evidence_overlap_logit_scale_weight." + ) + overlap = ( + adjusted["isotope_overlap_score"] + .fillna(transition_evidence_overlap_reference) + .astype(float) + .clip(lower=0.0) + .to_numpy() + ) + scales *= np.exp( + -transition_evidence_overlap_logit_scale_weight + * np.maximum(0.0, overlap - transition_evidence_overlap_reference) + ) + + eps = np.finfo(float).eps + pep = adjusted["pep"].astype(float).clip(lower=eps, upper=1 - eps).to_numpy() + pep_logit = np.log(pep / (1 - pep)) + adjusted["pep"] = expit(pep_logit * scales) + + logger.info( + "Applied pre-BM transition evidence shaping: " + f"nonunique_logit_scale={transition_evidence_nonunique_logit_scale}, " + f"no_phospho_loss_logit_scale={transition_evidence_no_phospho_loss_logit_scale}, " + f"overlap_logit_scale_weight={transition_evidence_overlap_logit_scale_weight}, " + f"overlap_reference={transition_evidence_overlap_reference}, " + f"max_isotope_overlap={transition_evidence_max_isotope_overlap}. " + f"Adjusted {len(adjusted)} transition rows." + ) + + return adjusted + + +def apply_ipf_score_adjustment( + result, + score_metrics, + ipf_grouped_fdr=False, + score_support_log_weight=0.0, + score_unique_support_log_weight=0.0, + score_log_intensity_weight=0.0, + score_phospho_loss_bonus=0.0, + score_intensity_reference=10000.0, + score_overlap_penalty_weight=0.0, + score_overlap_reference=0.02, + score_adjust_max_supporting_transitions=0, + score_adjust_no_phospho_loss_only=False, +): + """ + Experimentally re-rank IPF peptidoform hypotheses by adjusting the PEP in logit space + using support-count and feature-intensity signals, then recomputing q-values. + + Positive weights improve hypotheses with stronger support / higher intensity. When + score_adjust_max_supporting_transitions is set, the adjustment only applies to weak- + support hypotheses at or below that threshold. When score_adjust_no_phospho_loss_only + is enabled, only hypotheses without phospho-loss support are adjusted. + """ + if not _ipf_score_adjustment_active( + score_support_log_weight=score_support_log_weight, + score_unique_support_log_weight=score_unique_support_log_weight, + score_log_intensity_weight=score_log_intensity_weight, + score_phospho_loss_bonus=score_phospho_loss_bonus, + score_overlap_penalty_weight=score_overlap_penalty_weight, + ): + return result + + if score_intensity_reference <= 0: + raise ValueError("score_intensity_reference must be > 0.") + if score_overlap_reference < 0: + raise ValueError("score_overlap_reference must be >= 0.") + + merged = result.merge( + score_metrics, + left_on=["feature_id", "hypothesis"], + right_on=["feature_id", "peptide_id"], + how="left", + ) + + for metric_col in [ + "supporting_transitions", + "unique_supporting_transitions", + "phospho_loss_supporting_transitions", + ]: + if metric_col not in merged.columns: + merged[metric_col] = 0 + merged[metric_col] = merged[metric_col].fillna(0).astype(int) + + if score_log_intensity_weight != 0.0 and "feature_ms2_intensity" not in merged.columns: + raise ValueError( + "feature_ms2_intensity is required for support/intensity-aware IPF score adjustment." + ) + if score_overlap_penalty_weight != 0.0 and "median_supporting_isotope_overlap" not in merged.columns: + raise ValueError( + "median_supporting_isotope_overlap is required for overlap-aware IPF score adjustment." + ) + + apply_mask = merged["hypothesis"] != -1 + if score_adjust_max_supporting_transitions > 0: + apply_mask &= ( + merged["supporting_transitions"] <= score_adjust_max_supporting_transitions + ) + if score_adjust_no_phospho_loss_only: + apply_mask &= merged["phospho_loss_supporting_transitions"] == 0 + + score_bonus = np.zeros(len(merged), dtype=float) + if score_support_log_weight != 0.0: + score_bonus += score_support_log_weight * np.log1p( + merged["supporting_transitions"].astype(float).to_numpy() + ) + if score_unique_support_log_weight != 0.0: + score_bonus += score_unique_support_log_weight * np.log1p( + merged["unique_supporting_transitions"].astype(float).to_numpy() + ) + if score_log_intensity_weight != 0.0: + intensity = ( + merged["feature_ms2_intensity"] + .fillna(0.0) + .astype(float) + .clip(lower=1.0) + .to_numpy() + ) + score_bonus += score_log_intensity_weight * ( + np.log10(intensity) - np.log10(score_intensity_reference) + ) + if score_phospho_loss_bonus != 0.0: + score_bonus += score_phospho_loss_bonus * ( + merged["phospho_loss_supporting_transitions"].astype(float).to_numpy() > 0 + ) + if score_overlap_penalty_weight != 0.0: + overlap = ( + merged["median_supporting_isotope_overlap"] + .fillna(score_overlap_reference) + .astype(float) + .clip(lower=0.0) + .to_numpy() + ) + score_bonus -= score_overlap_penalty_weight * np.maximum( + 0.0, overlap - score_overlap_reference + ) + + score_bonus = np.where(apply_mask.to_numpy(), score_bonus, 0.0) + + eps = np.finfo(float).eps + pep = merged["pep"].astype(float).clip(lower=eps, upper=1 - eps).to_numpy() + pep_logit = np.log(pep / (1 - pep)) + adjusted_pep = expit(pep_logit - score_bonus) + adjusted_pep = np.clip(adjusted_pep, eps, 1 - eps) + + adjusted = merged.copy() + adjusted["pep"] = adjusted_pep + + if ipf_grouped_fdr: + if "num_peptidoforms" not in adjusted.columns: + raise ValueError( + "num_peptidoforms is required for grouped FDR after IPF score adjustment." + ) + adjusted["qvalue"] = adjusted.groupby("num_peptidoforms")["pep"].transform( + compute_model_fdr + ) + else: + adjusted["qvalue"] = compute_model_fdr(adjusted["pep"]) + + logger.info( + "Applied support/intensity-aware IPF score adjustment: " + f"support_log_weight={score_support_log_weight}, " + f"unique_support_log_weight={score_unique_support_log_weight}, " + f"log_intensity_weight={score_log_intensity_weight}, " + f"phospho_loss_bonus={score_phospho_loss_bonus}, " + f"intensity_reference={score_intensity_reference}, " + f"overlap_penalty_weight={score_overlap_penalty_weight}, " + f"overlap_reference={score_overlap_reference}, " + f"max_supporting_transitions={score_adjust_max_supporting_transitions}, " + f"no_phospho_loss_only={score_adjust_no_phospho_loss_only}. " + f"Adjusted {int(apply_mask.sum())}/{len(adjusted)} hypothesis rows." + ) + + return adjusted + + +def _apply_ipf_filter_thresholds( + metrics, + min_supporting_transitions=0, + min_unique_supporting_transitions=0, + require_phospho_loss_below_support=0, + min_peakgroup_intensity=0.0, + conditional_min_peakgroup_intensity=0.0, + conditional_min_peakgroup_intensity_max_supporting_transitions=0, + conditional_min_peakgroup_intensity_no_phospho_loss_only=False, + log_prefix="Applied IPF", +): + filtered = metrics.copy() + + for metric_col in [ + "supporting_transitions", + "unique_supporting_transitions", + "phospho_loss_supporting_transitions", + ]: + if metric_col not in filtered.columns: + filtered[metric_col] = 0 + filtered[metric_col] = filtered[metric_col].fillna(0).astype(int) + + if min_supporting_transitions > 0: + before = len(filtered) + filtered = filtered[ + filtered["supporting_transitions"] >= min_supporting_transitions + ].copy() + logger.info( + f"{log_prefix} supporting-transition filter: " + f"kept {len(filtered)}/{before} feature-hypothesis rows " + f"with supporting_transitions >= {min_supporting_transitions}." + ) + + if min_unique_supporting_transitions > 0: + before = len(filtered) + filtered = filtered[ + filtered["unique_supporting_transitions"] >= min_unique_supporting_transitions + ].copy() + logger.info( + f"{log_prefix} unique-supporting-transition filter: " + f"kept {len(filtered)}/{before} feature-hypothesis rows " + f"with unique_supporting_transitions >= {min_unique_supporting_transitions}." + ) + + if require_phospho_loss_below_support > 0: + before = len(filtered) + filtered = filtered[ + (filtered["supporting_transitions"] >= require_phospho_loss_below_support) + | (filtered["phospho_loss_supporting_transitions"] > 0) + ].copy() + logger.info( + f"{log_prefix} phospho-loss rescue filter: " + f"kept {len(filtered)}/{before} feature-hypothesis rows " + f"with supporting_transitions >= {require_phospho_loss_below_support} " + "or at least one phospho-loss supporting transition." + ) + + if min_peakgroup_intensity > 0: + if "feature_ms2_intensity" not in filtered.columns: + raise ValueError( + "feature_ms2_intensity is required for ipf_min_peakgroup_intensity filtering." + ) + before = len(filtered) + filtered = filtered[ + filtered["feature_ms2_intensity"] >= min_peakgroup_intensity + ].copy() + logger.info( + f"{log_prefix} peakgroup-intensity filter: " + f"kept {len(filtered)}/{before} feature-hypothesis rows " + f"with feature_ms2_intensity >= {min_peakgroup_intensity}." + ) + + if conditional_min_peakgroup_intensity > 0: + if conditional_min_peakgroup_intensity_max_supporting_transitions <= 0: + raise ValueError( + "ipf_conditional_min_peakgroup_intensity_max_supporting_transitions must be > 0 " + "when ipf_conditional_min_peakgroup_intensity is enabled." + ) + if "feature_ms2_intensity" not in filtered.columns: + raise ValueError( + "feature_ms2_intensity is required for ipf_conditional_min_peakgroup_intensity filtering." + ) + + before = len(filtered) + weak_support_mask = ( + filtered["supporting_transitions"] + <= conditional_min_peakgroup_intensity_max_supporting_transitions + ) + if conditional_min_peakgroup_intensity_no_phospho_loss_only: + weak_support_mask = weak_support_mask & ( + filtered["phospho_loss_supporting_transitions"] == 0 + ) + + filtered = filtered[ + (~weak_support_mask) + | ( + filtered["feature_ms2_intensity"] + >= conditional_min_peakgroup_intensity + ) + ].copy() + logger.info( + f"{log_prefix} conditional peakgroup-intensity filter: " + f"kept {len(filtered)}/{before} feature-hypothesis rows " + f"after requiring feature_ms2_intensity >= {conditional_min_peakgroup_intensity} " + f"for rows with supporting_transitions <= " + f"{conditional_min_peakgroup_intensity_max_supporting_transitions}" + + ( + " and no phospho-loss supporting transitions." + if conditional_min_peakgroup_intensity_no_phospho_loss_only + else "." + ) + ) + + return filtered + + +def apply_pre_bm_hypothesis_filters( + transition_table, + precursor_table, + propagate_signal_across_runs=False, + across_run_confidence_threshold=0.5, + min_supporting_transitions=0, + min_unique_supporting_transitions=0, + require_phospho_loss_below_support=0, + min_peakgroup_intensity=0.0, + conditional_min_peakgroup_intensity=0.0, + conditional_min_peakgroup_intensity_max_supporting_transitions=0, + conditional_min_peakgroup_intensity_no_phospho_loss_only=False, +): + """ + Applies optional structural hypothesis filters before transition-level Bayesian modeling. + + The same feature/peptidoform metrics used for post-IPF filtering are computed on the + current evidence state and used to prune weak hypotheses before posterior inference. + Retained rows have their num_peptidoforms recomputed so priors remain coherent. + + Args: + transition_table (pd.DataFrame): Transition-level peptidoform table. + precursor_table (pd.DataFrame): Peakgroup / precursor-level table. + propagate_signal_across_runs (bool): Whether IPF propagates evidence across runs. + across_run_confidence_threshold (float): Confidence threshold for signal propagation. + min_supporting_transitions (int): Minimum supporting transitions required. + min_unique_supporting_transitions (int): Minimum uniquely supporting transitions required. + require_phospho_loss_below_support (int): Require at least one phospho-loss supporting + transition when supporting_transitions is below this threshold. + min_peakgroup_intensity (float): Minimum MS2 feature intensity required. + + Returns: + pd.DataFrame: Filtered transition table ready for peptidoform inference. + """ + if not _ipf_filters_active( + min_supporting_transitions=min_supporting_transitions, + min_unique_supporting_transitions=min_unique_supporting_transitions, + require_phospho_loss_below_support=require_phospho_loss_below_support, + min_peakgroup_intensity=min_peakgroup_intensity, + conditional_min_peakgroup_intensity=conditional_min_peakgroup_intensity, + ): + return transition_table + + filter_metrics = prepare_post_ipf_filter_metrics( + transition_table, + precursor_table, + propagate_signal_across_runs=propagate_signal_across_runs, + across_run_confidence_threshold=across_run_confidence_threshold, + ) + kept_hypotheses = _apply_ipf_filter_thresholds( + filter_metrics, + min_supporting_transitions=min_supporting_transitions, + min_unique_supporting_transitions=min_unique_supporting_transitions, + require_phospho_loss_below_support=require_phospho_loss_below_support, + min_peakgroup_intensity=min_peakgroup_intensity, + conditional_min_peakgroup_intensity=conditional_min_peakgroup_intensity, + conditional_min_peakgroup_intensity_max_supporting_transitions=conditional_min_peakgroup_intensity_max_supporting_transitions, + conditional_min_peakgroup_intensity_no_phospho_loss_only=conditional_min_peakgroup_intensity_no_phospho_loss_only, + log_prefix="Applied pre-BM", + )[["feature_id", "peptide_id"]].drop_duplicates() + + filtered = transition_table.merge( + kept_hypotheses.assign(_keep_hypothesis=True), + on=["feature_id", "peptide_id"], + how="left", + ) + filtered = filtered[ + (filtered["peptide_id"] == -1) | (filtered["_keep_hypothesis"] == True) + ].copy() + filtered = filtered.drop(columns=["_keep_hypothesis"]) + + num_peptidoforms = ( + filtered.loc[filtered["peptide_id"] != -1] + .groupby("feature_id")["peptide_id"] + .nunique() + .rename("num_peptidoforms") + ) + filtered = filtered.drop(columns=["num_peptidoforms"], errors="ignore").merge( + num_peptidoforms, + on="feature_id", + how="left", + ) + filtered["num_peptidoforms"] = filtered["num_peptidoforms"].fillna(0).astype(int) + + logger.info( + "Applied pre-BM hypothesis filtering: " + f"kept {len(filtered)} transition rows spanning " + f"{filtered.loc[filtered['peptide_id'] != -1, ['feature_id', 'peptide_id']].drop_duplicates().shape[0]} " + "non-H0 feature-hypothesis candidates." + ) + + return filtered + + +def apply_post_ipf_filters( + result, + filter_metrics, + min_supporting_transitions=0, + min_unique_supporting_transitions=0, + require_phospho_loss_below_support=0, + min_peakgroup_intensity=0.0, + conditional_min_peakgroup_intensity=0.0, + conditional_min_peakgroup_intensity_max_supporting_transitions=0, + conditional_min_peakgroup_intensity_no_phospho_loss_only=False, +): + """ + Applies optional post-IPF filters to inferred peptidoform results. + + Args: + result (pd.DataFrame): Inferred peptidoform results with FEATURE_ID / PEPTIDE_ID. + filter_metrics (pd.DataFrame): Metrics from compute_post_ipf_filter_metrics(). + min_supporting_transitions (int): Minimum supporting transitions required. + min_unique_supporting_transitions (int): Minimum uniquely supporting transitions required. + require_phospho_loss_below_support (int): Require at least one phospho-loss supporting + transition when supporting_transitions is below this threshold. + min_peakgroup_intensity (float): Minimum MS2 feature intensity required. + + Returns: + pd.DataFrame: Filtered peptidoform results. + """ + if not _ipf_filters_active( + min_supporting_transitions=min_supporting_transitions, + min_unique_supporting_transitions=min_unique_supporting_transitions, + require_phospho_loss_below_support=require_phospho_loss_below_support, + min_peakgroup_intensity=min_peakgroup_intensity, + conditional_min_peakgroup_intensity=conditional_min_peakgroup_intensity, + ): + return result + + merged = result.merge( + filter_metrics, + left_on=["FEATURE_ID", "PEPTIDE_ID"], + right_on=["feature_id", "peptide_id"], + how="left", + ) + merged = _apply_ipf_filter_thresholds( + merged, + min_supporting_transitions=min_supporting_transitions, + min_unique_supporting_transitions=min_unique_supporting_transitions, + require_phospho_loss_below_support=require_phospho_loss_below_support, + min_peakgroup_intensity=min_peakgroup_intensity, + conditional_min_peakgroup_intensity=conditional_min_peakgroup_intensity, + conditional_min_peakgroup_intensity_max_supporting_transitions=conditional_min_peakgroup_intensity_max_supporting_transitions, + conditional_min_peakgroup_intensity_no_phospho_loss_only=conditional_min_peakgroup_intensity_no_phospho_loss_only, + log_prefix="Applied post-IPF", + ) + + return merged[ + ["FEATURE_ID", "PEPTIDE_ID", "PRECURSOR_PEAKGROUP_PEP", "QVALUE", "PEP"] + ].copy() + + def prepare_precursor_bm(data): """ Prepares Bayesian model data for precursor-level inference. @@ -282,16 +1191,72 @@ def prepare_transition_bm( return data -def apply_bm(data): +def apply_bm(data, use_log_space=False, evidence_epsilon=0.0): """ Applies the Bayesian model to compute posterior probabilities. Args: data (pd.DataFrame): Input Bayesian model data. + use_log_space (bool): Whether to compute Bayesian posteriors in log-space. + evidence_epsilon (float): Optional clipping epsilon applied to evidence values + before inference. 0 disables clipping. Returns: pd.DataFrame: Data with posterior probabilities for each hypothesis. """ + if evidence_epsilon < 0 or evidence_epsilon >= 0.5: + raise ValueError("evidence_epsilon must satisfy 0 <= epsilon < 0.5.") + + if evidence_epsilon > 0: + data = data.copy() + data["evidence"] = data["evidence"].clip( + lower=evidence_epsilon, upper=1 - evidence_epsilon + ) + + if use_log_space: + with np.errstate(divide="ignore", invalid="ignore"): + grouped_logs = ( + data.assign(log_evidence=np.log(data["evidence"])) + .groupby(["feature_id", "hypothesis"])["log_evidence"] + .sum() + .reset_index() + ) + grouped_prior = ( + data.groupby(["feature_id", "hypothesis"], as_index=False)["prior"] + .min() + ) + grouped_logs = grouped_logs.merge( + grouped_prior, on=["feature_id", "hypothesis"], how="left" + ) + grouped_logs["log_prior"] = np.log(grouped_logs["prior"]) + grouped_logs["log_likelihood_prior"] = ( + grouped_logs["log_evidence"] + grouped_logs["log_prior"] + ) + grouped_logs["log_likelihood_sum"] = grouped_logs.groupby("feature_id")[ + "log_likelihood_prior" + ].transform(logsumexp) + grouped_logs["posterior"] = np.exp( + grouped_logs["log_likelihood_prior"] + - grouped_logs["log_likelihood_sum"] + ) + grouped_logs["likelihood_prior"] = np.exp( + grouped_logs["log_likelihood_prior"] + ) + grouped_logs["likelihood_sum"] = np.exp( + grouped_logs["log_likelihood_sum"] + ) + + pp_data = grouped_logs[ + [ + "feature_id", + "hypothesis", + "likelihood_prior", + "likelihood_sum", + "posterior", + ] + ] + return pp_data.fillna(value=0) + # compute likelihood * prior per feature & hypothesis # all priors are identical but pandas DF multiplication requires aggregation, so we use min() pp_data = ( @@ -317,6 +1282,8 @@ def precursor_inference( ipf_ms2_scoring, ipf_max_precursor_pep, ipf_max_precursor_peakgroup_pep, + use_log_space_bm=False, + bm_evidence_epsilon=0.0, ): """ Conducts precursor-level inference. @@ -327,6 +1294,8 @@ def precursor_inference( ipf_ms2_scoring (bool): Whether to use MS2-level scoring. ipf_max_precursor_pep (float): Maximum PEP threshold for precursors. ipf_max_precursor_peakgroup_pep (float): Maximum PEP threshold for peak groups. + use_log_space_bm (bool): Whether to compute Bayesian posteriors in log-space. + bm_evidence_epsilon (float): Optional clipping epsilon applied to BM evidence. Returns: pd.DataFrame: Inferred precursor probabilities. @@ -364,7 +1333,11 @@ def precursor_inference( # compute posterior precursor probability logger.info("Conducting precursor-level inference ... ") - prec_pp_data = apply_bm(precursor_data_bm) + prec_pp_data = apply_bm( + precursor_data_bm, + use_log_space=use_log_space_bm, + evidence_epsilon=bm_evidence_epsilon, + ) prec_pp_data["precursor_peakgroup_pep"] = 1 - prec_pp_data["posterior"] inferred_precursors = prec_pp_data[prec_pp_data["hypothesis"]][ @@ -392,8 +1365,10 @@ def peptidoform_inference( transition_table, precursor_data, ipf_grouped_fdr, + ipf_grouped_fdr_strategy, propagate_signal_across_runs, across_run_confidence_threshold, + grouping_metrics=None, ): """ Conducts peptidoform-level inference. @@ -402,8 +1377,11 @@ def peptidoform_inference( transition_table (pd.DataFrame): Input data containing transition-level information. precursor_data (pd.DataFrame): Precursor-level probabilities. ipf_grouped_fdr (bool): Whether to use grouped FDR estimation. + ipf_grouped_fdr_strategy (str): Grouping strategy when grouped FDR is enabled. propagate_signal_across_runs (bool): Whether to propagate signal across runs. across_run_confidence_threshold (float): Confidence threshold for propagation. + grouping_metrics (pd.DataFrame | None): Optional per-feature/per-hypothesis + metrics required by grouped FDR strategies beyond ``num_peptidoforms``. Returns: pd.DataFrame: Inferred peptidoform probabilities and FDR estimates. @@ -421,23 +1399,19 @@ def peptidoform_inference( pf_pp_data = apply_bm(transition_data_bm) pf_pp_data["pep"] = 1 - pf_pp_data["posterior"] + pf_pp_data = pf_pp_data.merge( + transition_data_bm[["feature_id", "num_peptidoforms"]].drop_duplicates(), + on=["feature_id"], + how="left", + ) # compute model-based FDR - if ipf_grouped_fdr: - pf_pp_data["qvalue"] = ( - pd.merge( - pf_pp_data, - transition_data_bm[ - ["feature_id", "num_peptidoforms"] - ].drop_duplicates(), - on=["feature_id"], - how="inner", - ) - .groupby("num_peptidoforms")["pep"] - .transform(compute_model_fdr) - ) - else: - pf_pp_data["qvalue"] = compute_model_fdr(pf_pp_data["pep"]) + pf_pp_data["qvalue"] = compute_ipf_qvalues( + pf_pp_data, + grouped_fdr=ipf_grouped_fdr, + grouped_fdr_strategy=ipf_grouped_fdr_strategy, + grouping_metrics=grouping_metrics, + ) # merge precursor-level data with UIS data result = pf_pp_data.merge( @@ -493,12 +1467,31 @@ def infer_peptidoforms(config: IPFIOConfig): peptidoform_table = peptidoform_table.astype({"alignment_group_id": "int64"}) + transition_score_metrics = None + need_transition_score_metrics = ( + config.ipf_min_supporting_transitions > 0 + or config.ipf_min_peakgroup_intensity > 0 + or ( + config.ipf_grouped_fdr + and config.ipf_grouped_fdr_strategy == "support_phospho_loss" + ) + ) + if need_transition_score_metrics: + transition_score_metrics = prepare_post_ipf_filter_metrics( + peptidoform_table, + precursor_table, + propagate_signal_across_runs=config.propagate_signal_across_runs, + across_run_confidence_threshold=config.across_run_confidence_threshold, + ) + peptidoform_data = peptidoform_inference( peptidoform_table, precursor_data, config.ipf_grouped_fdr, + config.ipf_grouped_fdr_strategy, config.propagate_signal_across_runs, config.across_run_confidence_threshold, + grouping_metrics=transition_score_metrics, ) # finalize results and write to table @@ -516,6 +1509,12 @@ def infer_peptidoforms(config: IPFIOConfig): # Convert feature_id to int64 peptidoform_data = peptidoform_data.astype({"FEATURE_ID": "int64"}) + peptidoform_data = apply_post_ipf_filters( + peptidoform_data, + transition_score_metrics, + min_supporting_transitions=config.ipf_min_supporting_transitions, + min_peakgroup_intensity=config.ipf_min_peakgroup_intensity, + ) writer = WriterDispatcher.get_writer(config) writer.save_results(result=peptidoform_data) diff --git a/tests/test_ipf.py b/tests/test_ipf.py index 59332d24..783beb45 100644 --- a/tests/test_ipf.py +++ b/tests/test_ipf.py @@ -8,10 +8,15 @@ from numpy.testing import assert_almost_equal from pyprophet.ipf import ( - prepare_precursor_bm, - prepare_transition_bm, apply_bm, + apply_ipf_score_adjustment, + apply_transition_evidence_adjustment, + apply_pre_bm_hypothesis_filters, + apply_post_ipf_filters, compute_model_fdr, + compute_post_ipf_filter_metrics, + prepare_precursor_bm, + prepare_transition_bm, ) @@ -158,3 +163,459 @@ def test_3(): ] ], ) + + +def test_apply_bm_log_space_matches_raw(): + tin = pd.DataFrame( + { + "feature_id": ["id0"] * 6, + "evidence": [0.9, 0.8, 0.7, 0.1, 0.2, 0.3], + "hypothesis": [1, 1, 1, 2, 2, 2], + "prior": [0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + } + ) + + tout_raw = apply_bm(tin) + tout_log = apply_bm(tin, use_log_space=True) + + assert_frame_equal( + tout_raw[ + [ + "feature_id", + "hypothesis", + "likelihood_prior", + "likelihood_sum", + "posterior", + ] + ].sort_values(["feature_id", "hypothesis"]).reset_index(drop=True), + tout_log[ + [ + "feature_id", + "hypothesis", + "likelihood_prior", + "likelihood_sum", + "posterior", + ] + ].sort_values(["feature_id", "hypothesis"]).reset_index(drop=True), + check_exact=False, + atol=1e-12, + rtol=1e-12, + ) + + +def test_apply_bm_log_space_avoids_raw_underflow(): + n = 1000 + tin = pd.DataFrame( + { + "feature_id": ["id0"] * (2 * n), + "evidence": [0.1] * n + [0.2] * n, + "hypothesis": [1] * n + [2] * n, + "prior": [0.5] * (2 * n), + } + ) + + tout_raw = apply_bm(tin) + tout_log = apply_bm(tin, use_log_space=True) + + assert tout_raw["posterior"].sum() == 0 + assert np.isclose(tout_log["posterior"].sum(), 1.0) + assert tout_log.loc[tout_log["hypothesis"] == 2, "posterior"].iloc[0] > 0.999999 + + +def test_apply_bm_evidence_epsilon_clips_extreme_posteriors(): + tin = pd.DataFrame( + { + "feature_id": ["id0"] * 4, + "evidence": [1.0, 1.0, 0.0, 0.0], + "hypothesis": [1, 1, 2, 2], + "prior": [0.5, 0.5, 0.5, 0.5], + } + ) + + tout = apply_bm(tin, evidence_epsilon=1e-6) + + posterior_1 = tout.loc[tout["hypothesis"] == 1, "posterior"].iloc[0] + posterior_2 = tout.loc[tout["hypothesis"] == 2, "posterior"].iloc[0] + + assert 0 < posterior_1 < 1 + assert 0 < posterior_2 < 1 + assert np.isclose(posterior_1 + posterior_2, 1.0) + + +def test_compute_post_ipf_filter_metrics(): + transition_table = pd.DataFrame( + { + "feature_id": [1, 1, 1, 2, 2, 3], + "peptide_id": [10, 10, 11, 20, -1, 30], + "transition_id": [100, 101, 102, 200, 201, 300], + "bmask": [1, 0, 1, 1, 1, 1], + "n_mapped_peptides": [1, 2, 1, 2, 2, 1], + "has_phospho_loss": [1, 0, 0, 1, 0, 0], + "isotope_overlap_score": [0.01, 0.05, 0.02, 0.03, 0.04, 0.01], + } + ) + precursor_table = pd.DataFrame( + { + "feature_id": [1, 2, 3], + "feature_ms2_intensity": [1000.0, 200.0, 50.0], + } + ) + + tout = compute_post_ipf_filter_metrics(transition_table, precursor_table) + tref = pd.DataFrame( + { + "feature_id": [1, 1, 2, 3], + "peptide_id": [10, 11, 20, 30], + "supporting_transitions": [1, 1, 1, 1], + "unique_supporting_transitions": [1, 1, 0, 1], + "phospho_loss_supporting_transitions": [1, 0, 1, 0], + "median_supporting_isotope_overlap": [0.01, 0.02, 0.03, 0.01], + "feature_ms2_intensity": [1000.0, 1000.0, 200.0, 50.0], + } + ) + + assert_frame_equal( + tout.sort_values(["feature_id", "peptide_id"]).reset_index(drop=True), + tref.sort_values(["feature_id", "peptide_id"]).reset_index(drop=True), + ) + + +def test_apply_pre_bm_hypothesis_filters(): + transition_table = pd.DataFrame( + { + "feature_id": [1, 1, 1, 1, 1, 1, 2, 2], + "peptide_id": [10, 10, 11, 11, -1, -1, 20, -1], + "transition_id": [100, 101, 102, 103, 100, 101, 200, 200], + "bmask": [1, 1, 1, 0, 1, 0, 1, 1], + "num_peptidoforms": [2, 2, 2, 2, 2, 2, 1, 1], + "pep": [0.01, 0.02, 0.03, 0.2, 0.05, 0.8, 0.01, 0.05], + "n_mapped_peptides": [1, 1, 1, 1, 1, 1, 1, 1], + "has_phospho_loss": [1, 0, 0, 0, 0, 0, 0, 0], + } + ) + precursor_table = pd.DataFrame( + { + "feature_id": [1, 2], + "feature_ms2_intensity": [1000.0, 50.0], + } + ) + + tout = apply_pre_bm_hypothesis_filters( + transition_table, + precursor_table, + min_supporting_transitions=2, + min_peakgroup_intensity=100.0, + ) + + kept_hypotheses = tout.loc[tout["peptide_id"] != -1, ["feature_id", "peptide_id"]] + kept_hypotheses = kept_hypotheses.drop_duplicates().sort_values( + ["feature_id", "peptide_id"] + ) + tref_hypotheses = pd.DataFrame({"feature_id": [1], "peptide_id": [10]}) + + assert_frame_equal(kept_hypotheses.reset_index(drop=True), tref_hypotheses) + + num_pf = ( + tout[["feature_id", "num_peptidoforms"]] + .drop_duplicates() + .sort_values("feature_id") + .reset_index(drop=True) + ) + tref_num_pf = pd.DataFrame({"feature_id": [1, 2], "num_peptidoforms": [1, 0]}) + + assert_frame_equal(num_pf, tref_num_pf) + + +def test_apply_pre_bm_hypothesis_filters_conditional_intensity(): + transition_table = pd.DataFrame( + { + "feature_id": [1, 1, 1, 2, 2, 3, 3, 3], + "peptide_id": [10, -1, 20, 30, -1, 40, 40, -1], + "transition_id": [100, 100, 200, 300, 300, 400, 401, 400], + "bmask": [1, 1, 1, 1, 1, 1, 1, 1], + "num_peptidoforms": [1, 1, 1, 1, 1, 1, 1, 1], + "pep": [0.01, 0.05, 0.01, 0.01, 0.05, 0.01, 0.02, 0.05], + "n_mapped_peptides": [1, 1, 1, 1, 1, 1, 1, 1], + "has_phospho_loss": [0, 0, 1, 0, 0, 0, 0, 0], + } + ) + precursor_table = pd.DataFrame( + { + "feature_id": [1, 2, 3], + "feature_ms2_intensity": [500.0, 500.0, 20000.0], + } + ) + + tout = apply_pre_bm_hypothesis_filters( + transition_table, + precursor_table, + conditional_min_peakgroup_intensity=10000.0, + conditional_min_peakgroup_intensity_max_supporting_transitions=1, + conditional_min_peakgroup_intensity_no_phospho_loss_only=True, + ) + + kept_hypotheses = ( + tout.loc[tout["peptide_id"] != -1, ["feature_id", "peptide_id"]] + .drop_duplicates() + .sort_values(["feature_id", "peptide_id"]) + .reset_index(drop=True) + ) + tref_hypotheses = pd.DataFrame({"feature_id": [1, 3], "peptide_id": [20, 40]}) + + assert_frame_equal(kept_hypotheses, tref_hypotheses) + + +def test_apply_post_ipf_filters(): + result = pd.DataFrame( + { + "FEATURE_ID": [1, 1, 2, 3, 4], + "PEPTIDE_ID": [10, 11, 20, 30, 40], + "PRECURSOR_PEAKGROUP_PEP": [0.1, 0.1, 0.2, 0.3, 0.4], + "QVALUE": [0.01, 0.02, 0.03, 0.04, 0.05], + "PEP": [0.001, 0.002, 0.003, 0.004, 0.005], + } + ) + filter_metrics = pd.DataFrame( + { + "feature_id": [1, 1, 2, 3, 4], + "peptide_id": [10, 11, 20, 30, 40], + "supporting_transitions": [1, 2, 2, 3, 1], + "unique_supporting_transitions": [1, 2, 0, 3, 1], + "phospho_loss_supporting_transitions": [1, 0, 1, 0, 0], + "feature_ms2_intensity": [1000.0, 1000.0, 200.0, 50.0, 1000.0], + } + ) + + tout = apply_post_ipf_filters( + result, + filter_metrics, + min_supporting_transitions=1, + min_unique_supporting_transitions=1, + require_phospho_loss_below_support=2, + min_peakgroup_intensity=100.0, + ) + tref = pd.DataFrame( + { + "FEATURE_ID": [1, 1], + "PEPTIDE_ID": [10, 11], + "PRECURSOR_PEAKGROUP_PEP": [0.1, 0.1], + "QVALUE": [0.01, 0.02], + "PEP": [0.001, 0.002], + } + ) + + assert_frame_equal( + tout.sort_values(["FEATURE_ID", "PEPTIDE_ID"]).reset_index(drop=True), + tref.sort_values(["FEATURE_ID", "PEPTIDE_ID"]).reset_index(drop=True), + ) + + +def test_apply_post_ipf_filters_conditional_intensity(): + result = pd.DataFrame( + { + "FEATURE_ID": [1, 2, 3, 4], + "PEPTIDE_ID": [10, 20, 30, 40], + "PRECURSOR_PEAKGROUP_PEP": [0.1, 0.1, 0.1, 0.1], + "QVALUE": [0.01, 0.02, 0.03, 0.04], + "PEP": [0.001, 0.002, 0.003, 0.004], + } + ) + filter_metrics = pd.DataFrame( + { + "feature_id": [1, 2, 3, 4], + "peptide_id": [10, 20, 30, 40], + "supporting_transitions": [1, 1, 2, 3], + "unique_supporting_transitions": [1, 1, 2, 3], + "phospho_loss_supporting_transitions": [0, 1, 0, 0], + "feature_ms2_intensity": [5000.0, 5000.0, 5000.0, 5000.0], + } + ) + + tout = apply_post_ipf_filters( + result, + filter_metrics, + conditional_min_peakgroup_intensity=10000.0, + conditional_min_peakgroup_intensity_max_supporting_transitions=2, + conditional_min_peakgroup_intensity_no_phospho_loss_only=True, + ) + tref = pd.DataFrame( + { + "FEATURE_ID": [2, 4], + "PEPTIDE_ID": [20, 40], + "PRECURSOR_PEAKGROUP_PEP": [0.1, 0.1], + "QVALUE": [0.02, 0.04], + "PEP": [0.002, 0.004], + } + ) + + assert_frame_equal( + tout.sort_values(["FEATURE_ID", "PEPTIDE_ID"]).reset_index(drop=True), + tref.sort_values(["FEATURE_ID", "PEPTIDE_ID"]).reset_index(drop=True), + ) + + +def test_apply_ipf_score_adjustment_rewards_high_support_and_intensity(): + result = pd.DataFrame( + { + "feature_id": [1, 2], + "hypothesis": [10, 20], + "pep": [0.01, 0.01], + "qvalue": [0.01, 0.01], + "num_peptidoforms": [2, 2], + } + ) + score_metrics = pd.DataFrame( + { + "feature_id": [1, 2], + "peptide_id": [10, 20], + "supporting_transitions": [1, 4], + "unique_supporting_transitions": [1, 2], + "phospho_loss_supporting_transitions": [0, 1], + "feature_ms2_intensity": [1_000.0, 100_000.0], + } + ) + + tout = apply_ipf_score_adjustment( + result, + score_metrics, + score_support_log_weight=1.0, + score_log_intensity_weight=1.0, + score_intensity_reference=10_000.0, + ) + + pep_low = tout.loc[tout["hypothesis"] == 10, "pep"].iloc[0] + pep_high = tout.loc[tout["hypothesis"] == 20, "pep"].iloc[0] + + assert pep_high < pep_low + assert pep_high < 0.01 + assert pep_low > 0.01 + + +def test_apply_ipf_score_adjustment_conditional_mask_only_hits_low_support_no_loss(): + result = pd.DataFrame( + { + "feature_id": [1, 2, 3], + "hypothesis": [10, 20, 30], + "pep": [0.01, 0.01, 0.01], + "qvalue": [0.01, 0.01, 0.01], + "num_peptidoforms": [2, 2, 2], + } + ) + score_metrics = pd.DataFrame( + { + "feature_id": [1, 2, 3], + "peptide_id": [10, 20, 30], + "supporting_transitions": [1, 3, 1], + "unique_supporting_transitions": [1, 2, 1], + "phospho_loss_supporting_transitions": [0, 0, 1], + "feature_ms2_intensity": [1_000.0, 1_000.0, 1_000.0], + } + ) + + tout = apply_ipf_score_adjustment( + result, + score_metrics, + score_log_intensity_weight=1.0, + score_intensity_reference=10_000.0, + score_adjust_max_supporting_transitions=1, + score_adjust_no_phospho_loss_only=True, + ) + + pep_row_1 = tout.loc[tout["hypothesis"] == 10, "pep"].iloc[0] + pep_row_2 = tout.loc[tout["hypothesis"] == 20, "pep"].iloc[0] + pep_row_3 = tout.loc[tout["hypothesis"] == 30, "pep"].iloc[0] + + assert pep_row_1 > 0.01 + assert np.isclose(pep_row_2, 0.01) + assert np.isclose(pep_row_3, 0.01) + + +def test_apply_ipf_score_adjustment_rewards_unique_and_phospho_loss_and_penalizes_overlap(): + result = pd.DataFrame( + { + "feature_id": [1, 2, 3], + "hypothesis": [10, 20, 30], + "pep": [0.01, 0.01, 0.01], + "qvalue": [0.01, 0.01, 0.01], + "num_peptidoforms": [2, 2, 2], + } + ) + score_metrics = pd.DataFrame( + { + "feature_id": [1, 2, 3], + "peptide_id": [10, 20, 30], + "supporting_transitions": [2, 2, 2], + "unique_supporting_transitions": [0, 2, 2], + "phospho_loss_supporting_transitions": [0, 1, 1], + "median_supporting_isotope_overlap": [0.02, 0.02, 0.08], + "feature_ms2_intensity": [10000.0, 10000.0, 10000.0], + } + ) + + tout = apply_ipf_score_adjustment( + result, + score_metrics, + score_unique_support_log_weight=0.5, + score_phospho_loss_bonus=0.5, + score_overlap_penalty_weight=20.0, + score_overlap_reference=0.02, + ) + + pep_base = tout.loc[tout["hypothesis"] == 10, "pep"].iloc[0] + pep_good = tout.loc[tout["hypothesis"] == 20, "pep"].iloc[0] + pep_overlap = tout.loc[tout["hypothesis"] == 30, "pep"].iloc[0] + + assert pep_good < pep_base + assert pep_overlap > pep_good + + +def test_apply_transition_evidence_adjustment_shrinks_noninformative_pep_logits(): + tin = pd.DataFrame( + { + "feature_id": [1, 1, 1], + "transition_id": [100, 101, 102], + "pep": [0.01, 0.01, 0.01], + "n_mapped_peptides": [1, 2, 1], + "has_phospho_loss": [1, 0, 1], + "isotope_overlap_score": [0.01, 0.01, 0.08], + } + ) + + tout = apply_transition_evidence_adjustment( + tin, + transition_evidence_nonunique_logit_scale=0.5, + transition_evidence_no_phospho_loss_logit_scale=0.5, + transition_evidence_overlap_logit_scale_weight=20.0, + transition_evidence_overlap_reference=0.02, + ) + + pep_unique_loss = tout.loc[tout["transition_id"] == 100, "pep"].iloc[0] + pep_nonunique_noloss = tout.loc[tout["transition_id"] == 101, "pep"].iloc[0] + pep_high_overlap = tout.loc[tout["transition_id"] == 102, "pep"].iloc[0] + + assert np.isclose(pep_unique_loss, 0.01) + assert pep_nonunique_noloss > pep_unique_loss + assert pep_high_overlap > pep_unique_loss + + +def test_apply_transition_evidence_adjustment_filters_high_overlap_rows(): + tin = pd.DataFrame( + { + "feature_id": [1, 1], + "transition_id": [100, 101], + "pep": [0.01, 0.01], + "n_mapped_peptides": [1, 1], + "has_phospho_loss": [1, 1], + "isotope_overlap_score": [0.02, 0.08], + } + ) + + tout = apply_transition_evidence_adjustment( + tin, + transition_evidence_max_isotope_overlap=0.05, + ) + + assert_frame_equal( + tout.reset_index(drop=True), + tin.loc[tin["transition_id"] == 100].reset_index(drop=True), + ) From 052c588f75dc4babd7905ef169b0b5cb9156e83e Mon Sep 17 00:00:00 2001 From: singjc Date: Wed, 17 Jun 2026 20:10:59 -0400 Subject: [PATCH 2/5] refactor: simplify grouped FDR strategy by removing phospho-loss option and related checks --- pyprophet/_config.py | 6 +- pyprophet/cli/ipf.py | 4 +- pyprophet/io/ipf/osw.py | 62 +--- pyprophet/io/ipf/parquet.py | 9 - pyprophet/io/ipf/split_parquet.py | 13 - pyprophet/ipf.py | 497 +----------------------------- 6 files changed, 15 insertions(+), 576 deletions(-) diff --git a/pyprophet/_config.py b/pyprophet/_config.py index 0296b78f..3439fca3 100644 --- a/pyprophet/_config.py +++ b/pyprophet/_config.py @@ -456,7 +456,7 @@ class IPFIOConfig(BaseIOConfig): ipf_ms2_scoring (bool): Use MS2 precursor data for IPF. ipf_h0 (bool): Include possibility that peak groups are not covered by the peptidoform space (null hypothesis H0). ipf_grouped_fdr (bool): [Experimental] Compute grouped FDR instead of pooled FDR to support heterogeneous peptidoform counts per peak group. - ipf_grouped_fdr_strategy (Literal["num_peptidoforms", "support_phospho_loss"]): Grouping strategy used when grouped FDR is enabled. + ipf_grouped_fdr_strategy (Literal["num_peptidoforms"]): Grouping strategy used when grouped FDR is enabled. ipf_max_precursor_pep (float): Maximum PEP to consider scored precursors in IPF. ipf_max_peakgroup_pep (float): Maximum PEP to consider scored peak groups in IPF. ipf_max_precursor_peakgroup_pep (float): Maximum BHM layer 1 integrated precursor-peakgroup PEP to consider in IPF. @@ -474,9 +474,7 @@ class IPFIOConfig(BaseIOConfig): ipf_ms2_scoring: bool = True ipf_h0: bool = True ipf_grouped_fdr: bool = False - ipf_grouped_fdr_strategy: Literal[ - "num_peptidoforms", "support_phospho_loss" - ] = "num_peptidoforms" + ipf_grouped_fdr_strategy: Literal["num_peptidoforms"] = "num_peptidoforms" ipf_max_precursor_pep: float = 0.7 ipf_max_peakgroup_pep: float = 0.7 ipf_max_precursor_peakgroup_pep: float = 0.4 diff --git a/pyprophet/cli/ipf.py b/pyprophet/cli/ipf.py index d64af7b8..26cd4b1e 100644 --- a/pyprophet/cli/ipf.py +++ b/pyprophet/cli/ipf.py @@ -52,8 +52,8 @@ "--ipf_grouped_fdr_strategy", default="num_peptidoforms", show_default=True, - type=click.Choice(["num_peptidoforms", "support_phospho_loss"]), - help="Grouping strategy used when --ipf_grouped_fdr is enabled. 'num_peptidoforms' reproduces the legacy grouping. 'support_phospho_loss' groups by supporting-transition bin (<=1, 2, >=3) and presence of phospho-loss support.", + type=click.Choice(["num_peptidoforms"]), + help="Grouping strategy used when --ipf_grouped_fdr is enabled.", ) @click.option( "--ipf_max_precursor_pep", diff --git a/pyprophet/io/ipf/osw.py b/pyprophet/io/ipf/osw.py index 424561bf..bb181007 100644 --- a/pyprophet/io/ipf/osw.py +++ b/pyprophet/io/ipf/osw.py @@ -243,17 +243,7 @@ def _read_pyp_transition_duckdb(self, con): rc = self.config ipf_h0 = rc.ipf_h0 pep_threshold = rc.ipf_max_transition_pep - require_overlap = False - require_phospho_loss = ( - rc.ipf_grouped_fdr - and rc.ipf_grouped_fdr_strategy == "support_phospho_loss" - ) has_annotation = "ANNOTATION" in get_table_columns(self.infile, "TRANSITION") - - if require_phospho_loss and not has_annotation: - raise click.ClickException( - "ANNOTATION column is required for grouped FDR with strategy support_phospho_loss." - ) phospho_loss_expr = ( "CASE WHEN COALESCE(TRANSITION.ANNOTATION, '') LIKE '%-H3O4P1%' THEN 1 ELSE 0 END" if has_annotation @@ -263,28 +253,11 @@ def _read_pyp_transition_duckdb(self, con): # only the evidence is restricted to ipf_max_transition_pep, the peptidoform-space is complete logger.info("Info: Reading peptidoform-level data ...") - overlap_join = "" - overlap_select = "" - if require_overlap: - if not check_duckdb_table(con, "main", "FEATURE_TRANSITION"): - raise click.ClickException( - "FEATURE_TRANSITION is required for overlap-aware IPF score adjustment." - ) - overlap_join = ( - "INNER JOIN osw.FEATURE_TRANSITION " - "ON SCORE_TRANSITION.FEATURE_ID = FEATURE_TRANSITION.FEATURE_ID " - "AND SCORE_TRANSITION.TRANSITION_ID = FEATURE_TRANSITION.TRANSITION_ID" - ) - overlap_select = ( - ", FEATURE_TRANSITION.VAR_ISOTOPE_OVERLAP_SCORE AS ISOTOPE_OVERLAP_SCORE" - ) - queries = { "evidence": f""" - SELECT SCORE_TRANSITION.FEATURE_ID, SCORE_TRANSITION.TRANSITION_ID, PEP{overlap_select} + SELECT SCORE_TRANSITION.FEATURE_ID, SCORE_TRANSITION.TRANSITION_ID, PEP FROM osw.SCORE_TRANSITION INNER JOIN osw.TRANSITION ON SCORE_TRANSITION.TRANSITION_ID = TRANSITION.ID - {overlap_join} WHERE TRANSITION.TYPE != '' AND TRANSITION.DECOY = 0 AND PEP < {pep_threshold} @@ -612,17 +585,7 @@ def _read_pyp_transition_sqlite(self, con): rc = self.config ipf_h0 = rc.ipf_h0 pep_threshold = rc.ipf_max_transition_pep - require_overlap = False - require_phospho_loss = ( - rc.ipf_grouped_fdr - and rc.ipf_grouped_fdr_strategy == "support_phospho_loss" - ) has_annotation = "ANNOTATION" in get_table_columns(self.infile, "TRANSITION") - - if require_phospho_loss and not has_annotation: - raise click.ClickException( - "ANNOTATION column is required for grouped FDR with strategy support_phospho_loss." - ) phospho_loss_expr = ( "CASE WHEN COALESCE(TRANSITION.ANNOTATION, '') LIKE '%-H3O4P1%' THEN 1 ELSE 0 END" if has_annotation @@ -632,28 +595,11 @@ def _read_pyp_transition_sqlite(self, con): # only the evidence is restricted to ipf_max_transition_pep, the peptidoform-space is complete logger.info("Info: Reading peptidoform-level data ...") - overlap_join = "" - overlap_select = "" - if require_overlap: - if not check_sqlite_table(con, "FEATURE_TRANSITION"): - raise click.ClickException( - "FEATURE_TRANSITION is required for overlap-aware IPF score adjustment." - ) - overlap_join = ( - "INNER JOIN FEATURE_TRANSITION " - "ON SCORE_TRANSITION.FEATURE_ID = FEATURE_TRANSITION.FEATURE_ID " - "AND SCORE_TRANSITION.TRANSITION_ID = FEATURE_TRANSITION.TRANSITION_ID" - ) - overlap_select = ( - ", FEATURE_TRANSITION.VAR_ISOTOPE_OVERLAP_SCORE AS ISOTOPE_OVERLAP_SCORE" - ) - queries = { "evidence": """ - SELECT SCORE_TRANSITION.FEATURE_ID, SCORE_TRANSITION.TRANSITION_ID, PEP{overlap_select} + SELECT SCORE_TRANSITION.FEATURE_ID, SCORE_TRANSITION.TRANSITION_ID, PEP FROM SCORE_TRANSITION INNER JOIN TRANSITION ON SCORE_TRANSITION.TRANSITION_ID = TRANSITION.ID - {overlap_join} WHERE TRANSITION.TYPE != '' AND TRANSITION.DECOY = 0 AND PEP < ? @@ -704,9 +650,7 @@ def _read_pyp_transition_sqlite(self, con): # Execute evidence = pd.read_sql_query( - queries["evidence"].format( - overlap_select=overlap_select, overlap_join=overlap_join - ), + queries["evidence"], con, params=[pep_threshold], ).rename(columns=str.lower) diff --git a/pyprophet/io/ipf/parquet.py b/pyprophet/io/ipf/parquet.py index a00ec449..87912f0c 100644 --- a/pyprophet/io/ipf/parquet.py +++ b/pyprophet/io/ipf/parquet.py @@ -164,11 +164,6 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: cfg = self.config ipf_h0 = cfg.ipf_h0 pep_threshold = cfg.ipf_max_transition_pep - require_phospho_loss = ( - cfg.ipf_grouped_fdr - and cfg.ipf_grouped_fdr_strategy == "support_phospho_loss" - ) - require_overlap = False logger.info("Reading peptidoform-level data ...") @@ -179,10 +174,6 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: "IPF_PEPTIDE_ID column is required in transition features." ) has_annotation = "ANNOTATION" in all_cols - if require_phospho_loss and not has_annotation: - raise click.ClickException( - "ANNOTATION column is required for grouped FDR with strategy support_phospho_loss." - ) has_overlap_col = "FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE" in all_cols has_phospho_loss_sql = ( "MAX(CASE WHEN COALESCE(ANNOTATION, '') LIKE '%-H3O4P1%' THEN 1 ELSE 0 END) AS HAS_PHOSPHO_LOSS" diff --git a/pyprophet/io/ipf/split_parquet.py b/pyprophet/io/ipf/split_parquet.py index a93a91a5..93acab58 100644 --- a/pyprophet/io/ipf/split_parquet.py +++ b/pyprophet/io/ipf/split_parquet.py @@ -176,11 +176,6 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: cfg = self.config ipf_h0 = cfg.ipf_h0 pep_threshold = cfg.ipf_max_transition_pep - require_phospho_loss = ( - cfg.ipf_grouped_fdr - and cfg.ipf_grouped_fdr_strategy == "support_phospho_loss" - ) - require_overlap = False logger.info("Reading peptidoform-level data ...") @@ -205,17 +200,9 @@ def _read_pyp_transition(self, con) -> pd.DataFrame: "IPF_PEPTIDE_ID column is required in transition features." ) has_annotation = "ANNOTATION" in all_transition_cols - if require_phospho_loss and not has_annotation: - raise click.ClickException( - "ANNOTATION column is required for grouped FDR with strategy support_phospho_loss." - ) has_overlap_col = ( "FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE" in all_transition_cols ) - if require_overlap and not has_overlap_col: - raise click.ClickException( - "FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE is required for overlap-aware IPF score adjustment." - ) has_phospho_loss_sql = ( "MAX(CASE WHEN COALESCE(t.ANNOTATION, '') LIKE '%-H3O4P1%' THEN 1 ELSE 0 END) AS HAS_PHOSPHO_LOSS" if has_annotation diff --git a/pyprophet/ipf.py b/pyprophet/ipf.py index 62d5c2c5..da3b9878 100644 --- a/pyprophet/ipf.py +++ b/pyprophet/ipf.py @@ -95,101 +95,21 @@ def compute_grouped_model_fdr(data_in, group_keys, log_prefix="Grouped FDR"): return qvalues -def _support_phospho_loss_group_keys(metrics): - """ - Build coarse FDR strata from support strength and phospho-loss support. - - Support is grouped into <=1, 2, and >=3 supporting transitions. Each support - stratum is then split by whether at least one phospho-loss supporting - transition is present. - """ - supporting = metrics["supporting_transitions"].fillna(0).astype(int).to_numpy() - phospho_loss = ( - metrics["phospho_loss_supporting_transitions"].fillna(0).astype(int).to_numpy() - ) - - support_labels = np.where( - supporting <= 1, - "support_le1", - np.where(supporting == 2, "support_eq2", "support_ge3"), - ) - phospho_labels = np.where(phospho_loss > 0, "phloss", "no_phloss") - return pd.Series( - support_labels + "__" + phospho_labels, - index=metrics.index, - dtype="object", - ) - - -def compute_ipf_qvalues( - pf_pp_data, - grouped_fdr=False, - grouped_fdr_strategy="num_peptidoforms", - grouping_metrics=None, -): +def compute_ipf_qvalues(pf_pp_data, grouped_fdr=False): """ Compute IPF q-values using pooled or grouped model-based FDR. - Grouped FDR currently supports: - - ``num_peptidoforms``: the legacy grouping by per-feature peptidoform count - - ``support_phospho_loss``: grouping by supporting-transition bin and - phospho-loss support + Grouped FDR groups rows by the per-feature peptidoform count. """ if not grouped_fdr: return compute_model_fdr(pf_pp_data["pep"]) - if grouped_fdr_strategy == "num_peptidoforms": - if "num_peptidoforms" not in pf_pp_data.columns: - raise ValueError( - "num_peptidoforms is required for grouped FDR with strategy 'num_peptidoforms'." - ) - return compute_grouped_model_fdr( - pf_pp_data["pep"], - pf_pp_data["num_peptidoforms"].fillna(-1).astype(int), - log_prefix="Grouped FDR by num_peptidoforms", - ) - - if grouped_fdr_strategy == "support_phospho_loss": - if grouping_metrics is None: - raise ValueError( - "grouping_metrics is required for grouped FDR with strategy 'support_phospho_loss'." - ) - required_cols = { - "feature_id", - "peptide_id", - "supporting_transitions", - "phospho_loss_supporting_transitions", - } - missing = required_cols - set(grouping_metrics.columns) - if missing: - raise ValueError( - "grouping_metrics is missing required columns for grouped FDR " - f"strategy 'support_phospho_loss': {sorted(missing)}" - ) - - merged = pf_pp_data.merge( - grouping_metrics[ - [ - "feature_id", - "peptide_id", - "supporting_transitions", - "phospho_loss_supporting_transitions", - ] - ].drop_duplicates(), - left_on=["feature_id", "hypothesis"], - right_on=["feature_id", "peptide_id"], - how="left", - ) - group_keys = _support_phospho_loss_group_keys(merged) - return compute_grouped_model_fdr( - merged["pep"], - group_keys, - log_prefix="Grouped FDR by support/phospho-loss", - ) - - raise ValueError( - "Unsupported ipf_grouped_fdr_strategy: " - f"{grouped_fdr_strategy!r}. Expected 'num_peptidoforms' or 'support_phospho_loss'." + if "num_peptidoforms" not in pf_pp_data.columns: + raise ValueError("num_peptidoforms is required for grouped FDR.") + return compute_grouped_model_fdr( + pf_pp_data["pep"], + pf_pp_data["num_peptidoforms"].fillna(-1).astype(int), + log_prefix="Grouped FDR by num_peptidoforms", ) @@ -410,300 +330,6 @@ def _ipf_filters_active( ) -def _ipf_score_adjustment_active( - score_support_log_weight=0.0, - score_unique_support_log_weight=0.0, - score_log_intensity_weight=0.0, - score_phospho_loss_bonus=0.0, - score_overlap_penalty_weight=0.0, -): - return ( - score_support_log_weight != 0.0 - or score_unique_support_log_weight != 0.0 - or score_log_intensity_weight != 0.0 - or score_phospho_loss_bonus != 0.0 - or score_overlap_penalty_weight != 0.0 - ) - - -def _transition_evidence_adjustment_active( - transition_evidence_nonunique_logit_scale=1.0, - transition_evidence_no_phospho_loss_logit_scale=1.0, - transition_evidence_overlap_logit_scale_weight=0.0, - transition_evidence_max_isotope_overlap=0.0, -): - return ( - transition_evidence_nonunique_logit_scale != 1.0 - or transition_evidence_no_phospho_loss_logit_scale != 1.0 - or transition_evidence_overlap_logit_scale_weight != 0.0 - or transition_evidence_max_isotope_overlap > 0.0 - ) - - -def apply_transition_evidence_adjustment( - transition_table, - transition_evidence_nonunique_logit_scale=1.0, - transition_evidence_no_phospho_loss_logit_scale=1.0, - transition_evidence_overlap_logit_scale_weight=0.0, - transition_evidence_overlap_reference=0.02, - transition_evidence_max_isotope_overlap=0.0, -): - """ - Adjust transition-level PEPs before BM so weak/non-informative identifying transitions - contribute less extreme evidence. - - The adjustment shrinks the transition PEP logit toward zero, which weakens both supporting - and opposing evidence symmetrically once the BM evidence is constructed from PEP and BMASK. - Optionally, high-overlap transitions can be removed entirely before BM. - """ - if not _transition_evidence_adjustment_active( - transition_evidence_nonunique_logit_scale=transition_evidence_nonunique_logit_scale, - transition_evidence_no_phospho_loss_logit_scale=transition_evidence_no_phospho_loss_logit_scale, - transition_evidence_overlap_logit_scale_weight=transition_evidence_overlap_logit_scale_weight, - transition_evidence_max_isotope_overlap=transition_evidence_max_isotope_overlap, - ): - return transition_table - - if transition_evidence_nonunique_logit_scale <= 0: - raise ValueError("transition_evidence_nonunique_logit_scale must be > 0.") - if transition_evidence_no_phospho_loss_logit_scale <= 0: - raise ValueError( - "transition_evidence_no_phospho_loss_logit_scale must be > 0." - ) - if transition_evidence_overlap_logit_scale_weight < 0: - raise ValueError( - "transition_evidence_overlap_logit_scale_weight must be >= 0." - ) - if transition_evidence_overlap_reference < 0: - raise ValueError("transition_evidence_overlap_reference must be >= 0.") - if transition_evidence_max_isotope_overlap < 0: - raise ValueError("transition_evidence_max_isotope_overlap must be >= 0.") - - adjusted = transition_table.copy() - - if transition_evidence_max_isotope_overlap > 0: - if "isotope_overlap_score" not in adjusted.columns: - raise ValueError( - "isotope_overlap_score is required for transition_evidence_max_isotope_overlap filtering." - ) - before = len(adjusted) - adjusted = adjusted[ - adjusted["isotope_overlap_score"].fillna(transition_evidence_overlap_reference) - <= transition_evidence_max_isotope_overlap - ].copy() - logger.info( - "Applied pre-BM transition overlap filter: " - f"kept {len(adjusted)}/{before} transition rows with isotope_overlap_score <= " - f"{transition_evidence_max_isotope_overlap}." - ) - - if "pep" not in adjusted.columns: - raise ValueError("Transition table must contain pep for evidence adjustment.") - - scales = np.ones(len(adjusted), dtype=float) - - if transition_evidence_nonunique_logit_scale != 1.0: - if "n_mapped_peptides" not in adjusted.columns: - raise ValueError( - "n_mapped_peptides is required for transition_evidence_nonunique_logit_scale." - ) - scales *= np.where( - adjusted["n_mapped_peptides"].fillna(0).astype(int).to_numpy() > 1, - transition_evidence_nonunique_logit_scale, - 1.0, - ) - - if transition_evidence_no_phospho_loss_logit_scale != 1.0: - if "has_phospho_loss" not in adjusted.columns: - raise ValueError( - "has_phospho_loss is required for transition_evidence_no_phospho_loss_logit_scale." - ) - scales *= np.where( - adjusted["has_phospho_loss"].fillna(0).astype(int).to_numpy() > 0, - 1.0, - transition_evidence_no_phospho_loss_logit_scale, - ) - - if transition_evidence_overlap_logit_scale_weight != 0.0: - if "isotope_overlap_score" not in adjusted.columns: - raise ValueError( - "isotope_overlap_score is required for transition_evidence_overlap_logit_scale_weight." - ) - overlap = ( - adjusted["isotope_overlap_score"] - .fillna(transition_evidence_overlap_reference) - .astype(float) - .clip(lower=0.0) - .to_numpy() - ) - scales *= np.exp( - -transition_evidence_overlap_logit_scale_weight - * np.maximum(0.0, overlap - transition_evidence_overlap_reference) - ) - - eps = np.finfo(float).eps - pep = adjusted["pep"].astype(float).clip(lower=eps, upper=1 - eps).to_numpy() - pep_logit = np.log(pep / (1 - pep)) - adjusted["pep"] = expit(pep_logit * scales) - - logger.info( - "Applied pre-BM transition evidence shaping: " - f"nonunique_logit_scale={transition_evidence_nonunique_logit_scale}, " - f"no_phospho_loss_logit_scale={transition_evidence_no_phospho_loss_logit_scale}, " - f"overlap_logit_scale_weight={transition_evidence_overlap_logit_scale_weight}, " - f"overlap_reference={transition_evidence_overlap_reference}, " - f"max_isotope_overlap={transition_evidence_max_isotope_overlap}. " - f"Adjusted {len(adjusted)} transition rows." - ) - - return adjusted - - -def apply_ipf_score_adjustment( - result, - score_metrics, - ipf_grouped_fdr=False, - score_support_log_weight=0.0, - score_unique_support_log_weight=0.0, - score_log_intensity_weight=0.0, - score_phospho_loss_bonus=0.0, - score_intensity_reference=10000.0, - score_overlap_penalty_weight=0.0, - score_overlap_reference=0.02, - score_adjust_max_supporting_transitions=0, - score_adjust_no_phospho_loss_only=False, -): - """ - Experimentally re-rank IPF peptidoform hypotheses by adjusting the PEP in logit space - using support-count and feature-intensity signals, then recomputing q-values. - - Positive weights improve hypotheses with stronger support / higher intensity. When - score_adjust_max_supporting_transitions is set, the adjustment only applies to weak- - support hypotheses at or below that threshold. When score_adjust_no_phospho_loss_only - is enabled, only hypotheses without phospho-loss support are adjusted. - """ - if not _ipf_score_adjustment_active( - score_support_log_weight=score_support_log_weight, - score_unique_support_log_weight=score_unique_support_log_weight, - score_log_intensity_weight=score_log_intensity_weight, - score_phospho_loss_bonus=score_phospho_loss_bonus, - score_overlap_penalty_weight=score_overlap_penalty_weight, - ): - return result - - if score_intensity_reference <= 0: - raise ValueError("score_intensity_reference must be > 0.") - if score_overlap_reference < 0: - raise ValueError("score_overlap_reference must be >= 0.") - - merged = result.merge( - score_metrics, - left_on=["feature_id", "hypothesis"], - right_on=["feature_id", "peptide_id"], - how="left", - ) - - for metric_col in [ - "supporting_transitions", - "unique_supporting_transitions", - "phospho_loss_supporting_transitions", - ]: - if metric_col not in merged.columns: - merged[metric_col] = 0 - merged[metric_col] = merged[metric_col].fillna(0).astype(int) - - if score_log_intensity_weight != 0.0 and "feature_ms2_intensity" not in merged.columns: - raise ValueError( - "feature_ms2_intensity is required for support/intensity-aware IPF score adjustment." - ) - if score_overlap_penalty_weight != 0.0 and "median_supporting_isotope_overlap" not in merged.columns: - raise ValueError( - "median_supporting_isotope_overlap is required for overlap-aware IPF score adjustment." - ) - - apply_mask = merged["hypothesis"] != -1 - if score_adjust_max_supporting_transitions > 0: - apply_mask &= ( - merged["supporting_transitions"] <= score_adjust_max_supporting_transitions - ) - if score_adjust_no_phospho_loss_only: - apply_mask &= merged["phospho_loss_supporting_transitions"] == 0 - - score_bonus = np.zeros(len(merged), dtype=float) - if score_support_log_weight != 0.0: - score_bonus += score_support_log_weight * np.log1p( - merged["supporting_transitions"].astype(float).to_numpy() - ) - if score_unique_support_log_weight != 0.0: - score_bonus += score_unique_support_log_weight * np.log1p( - merged["unique_supporting_transitions"].astype(float).to_numpy() - ) - if score_log_intensity_weight != 0.0: - intensity = ( - merged["feature_ms2_intensity"] - .fillna(0.0) - .astype(float) - .clip(lower=1.0) - .to_numpy() - ) - score_bonus += score_log_intensity_weight * ( - np.log10(intensity) - np.log10(score_intensity_reference) - ) - if score_phospho_loss_bonus != 0.0: - score_bonus += score_phospho_loss_bonus * ( - merged["phospho_loss_supporting_transitions"].astype(float).to_numpy() > 0 - ) - if score_overlap_penalty_weight != 0.0: - overlap = ( - merged["median_supporting_isotope_overlap"] - .fillna(score_overlap_reference) - .astype(float) - .clip(lower=0.0) - .to_numpy() - ) - score_bonus -= score_overlap_penalty_weight * np.maximum( - 0.0, overlap - score_overlap_reference - ) - - score_bonus = np.where(apply_mask.to_numpy(), score_bonus, 0.0) - - eps = np.finfo(float).eps - pep = merged["pep"].astype(float).clip(lower=eps, upper=1 - eps).to_numpy() - pep_logit = np.log(pep / (1 - pep)) - adjusted_pep = expit(pep_logit - score_bonus) - adjusted_pep = np.clip(adjusted_pep, eps, 1 - eps) - - adjusted = merged.copy() - adjusted["pep"] = adjusted_pep - - if ipf_grouped_fdr: - if "num_peptidoforms" not in adjusted.columns: - raise ValueError( - "num_peptidoforms is required for grouped FDR after IPF score adjustment." - ) - adjusted["qvalue"] = adjusted.groupby("num_peptidoforms")["pep"].transform( - compute_model_fdr - ) - else: - adjusted["qvalue"] = compute_model_fdr(adjusted["pep"]) - - logger.info( - "Applied support/intensity-aware IPF score adjustment: " - f"support_log_weight={score_support_log_weight}, " - f"unique_support_log_weight={score_unique_support_log_weight}, " - f"log_intensity_weight={score_log_intensity_weight}, " - f"phospho_loss_bonus={score_phospho_loss_bonus}, " - f"intensity_reference={score_intensity_reference}, " - f"overlap_penalty_weight={score_overlap_penalty_weight}, " - f"overlap_reference={score_overlap_reference}, " - f"max_supporting_transitions={score_adjust_max_supporting_transitions}, " - f"no_phospho_loss_only={score_adjust_no_phospho_loss_only}. " - f"Adjusted {int(apply_mask.sum())}/{len(adjusted)} hypothesis rows." - ) - - return adjusted - - def _apply_ipf_filter_thresholds( metrics, min_supporting_transitions=0, @@ -820,100 +446,6 @@ def _apply_ipf_filter_thresholds( return filtered -def apply_pre_bm_hypothesis_filters( - transition_table, - precursor_table, - propagate_signal_across_runs=False, - across_run_confidence_threshold=0.5, - min_supporting_transitions=0, - min_unique_supporting_transitions=0, - require_phospho_loss_below_support=0, - min_peakgroup_intensity=0.0, - conditional_min_peakgroup_intensity=0.0, - conditional_min_peakgroup_intensity_max_supporting_transitions=0, - conditional_min_peakgroup_intensity_no_phospho_loss_only=False, -): - """ - Applies optional structural hypothesis filters before transition-level Bayesian modeling. - - The same feature/peptidoform metrics used for post-IPF filtering are computed on the - current evidence state and used to prune weak hypotheses before posterior inference. - Retained rows have their num_peptidoforms recomputed so priors remain coherent. - - Args: - transition_table (pd.DataFrame): Transition-level peptidoform table. - precursor_table (pd.DataFrame): Peakgroup / precursor-level table. - propagate_signal_across_runs (bool): Whether IPF propagates evidence across runs. - across_run_confidence_threshold (float): Confidence threshold for signal propagation. - min_supporting_transitions (int): Minimum supporting transitions required. - min_unique_supporting_transitions (int): Minimum uniquely supporting transitions required. - require_phospho_loss_below_support (int): Require at least one phospho-loss supporting - transition when supporting_transitions is below this threshold. - min_peakgroup_intensity (float): Minimum MS2 feature intensity required. - - Returns: - pd.DataFrame: Filtered transition table ready for peptidoform inference. - """ - if not _ipf_filters_active( - min_supporting_transitions=min_supporting_transitions, - min_unique_supporting_transitions=min_unique_supporting_transitions, - require_phospho_loss_below_support=require_phospho_loss_below_support, - min_peakgroup_intensity=min_peakgroup_intensity, - conditional_min_peakgroup_intensity=conditional_min_peakgroup_intensity, - ): - return transition_table - - filter_metrics = prepare_post_ipf_filter_metrics( - transition_table, - precursor_table, - propagate_signal_across_runs=propagate_signal_across_runs, - across_run_confidence_threshold=across_run_confidence_threshold, - ) - kept_hypotheses = _apply_ipf_filter_thresholds( - filter_metrics, - min_supporting_transitions=min_supporting_transitions, - min_unique_supporting_transitions=min_unique_supporting_transitions, - require_phospho_loss_below_support=require_phospho_loss_below_support, - min_peakgroup_intensity=min_peakgroup_intensity, - conditional_min_peakgroup_intensity=conditional_min_peakgroup_intensity, - conditional_min_peakgroup_intensity_max_supporting_transitions=conditional_min_peakgroup_intensity_max_supporting_transitions, - conditional_min_peakgroup_intensity_no_phospho_loss_only=conditional_min_peakgroup_intensity_no_phospho_loss_only, - log_prefix="Applied pre-BM", - )[["feature_id", "peptide_id"]].drop_duplicates() - - filtered = transition_table.merge( - kept_hypotheses.assign(_keep_hypothesis=True), - on=["feature_id", "peptide_id"], - how="left", - ) - filtered = filtered[ - (filtered["peptide_id"] == -1) | (filtered["_keep_hypothesis"] == True) - ].copy() - filtered = filtered.drop(columns=["_keep_hypothesis"]) - - num_peptidoforms = ( - filtered.loc[filtered["peptide_id"] != -1] - .groupby("feature_id")["peptide_id"] - .nunique() - .rename("num_peptidoforms") - ) - filtered = filtered.drop(columns=["num_peptidoforms"], errors="ignore").merge( - num_peptidoforms, - on="feature_id", - how="left", - ) - filtered["num_peptidoforms"] = filtered["num_peptidoforms"].fillna(0).astype(int) - - logger.info( - "Applied pre-BM hypothesis filtering: " - f"kept {len(filtered)} transition rows spanning " - f"{filtered.loc[filtered['peptide_id'] != -1, ['feature_id', 'peptide_id']].drop_duplicates().shape[0]} " - "non-H0 feature-hypothesis candidates." - ) - - return filtered - - def apply_post_ipf_filters( result, filter_metrics, @@ -1365,10 +897,8 @@ def peptidoform_inference( transition_table, precursor_data, ipf_grouped_fdr, - ipf_grouped_fdr_strategy, propagate_signal_across_runs, across_run_confidence_threshold, - grouping_metrics=None, ): """ Conducts peptidoform-level inference. @@ -1377,11 +907,8 @@ def peptidoform_inference( transition_table (pd.DataFrame): Input data containing transition-level information. precursor_data (pd.DataFrame): Precursor-level probabilities. ipf_grouped_fdr (bool): Whether to use grouped FDR estimation. - ipf_grouped_fdr_strategy (str): Grouping strategy when grouped FDR is enabled. propagate_signal_across_runs (bool): Whether to propagate signal across runs. across_run_confidence_threshold (float): Confidence threshold for propagation. - grouping_metrics (pd.DataFrame | None): Optional per-feature/per-hypothesis - metrics required by grouped FDR strategies beyond ``num_peptidoforms``. Returns: pd.DataFrame: Inferred peptidoform probabilities and FDR estimates. @@ -1409,8 +936,6 @@ def peptidoform_inference( pf_pp_data["qvalue"] = compute_ipf_qvalues( pf_pp_data, grouped_fdr=ipf_grouped_fdr, - grouped_fdr_strategy=ipf_grouped_fdr_strategy, - grouping_metrics=grouping_metrics, ) # merge precursor-level data with UIS data @@ -1471,10 +996,6 @@ def infer_peptidoforms(config: IPFIOConfig): need_transition_score_metrics = ( config.ipf_min_supporting_transitions > 0 or config.ipf_min_peakgroup_intensity > 0 - or ( - config.ipf_grouped_fdr - and config.ipf_grouped_fdr_strategy == "support_phospho_loss" - ) ) if need_transition_score_metrics: transition_score_metrics = prepare_post_ipf_filter_metrics( @@ -1488,10 +1009,8 @@ def infer_peptidoforms(config: IPFIOConfig): peptidoform_table, precursor_data, config.ipf_grouped_fdr, - config.ipf_grouped_fdr_strategy, config.propagate_signal_across_runs, config.across_run_confidence_threshold, - grouping_metrics=transition_score_metrics, ) # finalize results and write to table From 9bddfa2ba635f8a3b86f7371f33600fb1b2a5cdf Mon Sep 17 00:00:00 2001 From: singjc Date: Wed, 17 Jun 2026 20:12:01 -0400 Subject: [PATCH 3/5] refactor: remove unused pre-BM hypothesis filter tests and related functions --- tests/test_ipf.py | 256 ---------------------------------------------- 1 file changed, 256 deletions(-) diff --git a/tests/test_ipf.py b/tests/test_ipf.py index 783beb45..afb4fa43 100644 --- a/tests/test_ipf.py +++ b/tests/test_ipf.py @@ -9,9 +9,6 @@ from pyprophet.ipf import ( apply_bm, - apply_ipf_score_adjustment, - apply_transition_evidence_adjustment, - apply_pre_bm_hypothesis_filters, apply_post_ipf_filters, compute_model_fdr, compute_post_ipf_filter_metrics, @@ -279,92 +276,6 @@ def test_compute_post_ipf_filter_metrics(): tref.sort_values(["feature_id", "peptide_id"]).reset_index(drop=True), ) - -def test_apply_pre_bm_hypothesis_filters(): - transition_table = pd.DataFrame( - { - "feature_id": [1, 1, 1, 1, 1, 1, 2, 2], - "peptide_id": [10, 10, 11, 11, -1, -1, 20, -1], - "transition_id": [100, 101, 102, 103, 100, 101, 200, 200], - "bmask": [1, 1, 1, 0, 1, 0, 1, 1], - "num_peptidoforms": [2, 2, 2, 2, 2, 2, 1, 1], - "pep": [0.01, 0.02, 0.03, 0.2, 0.05, 0.8, 0.01, 0.05], - "n_mapped_peptides": [1, 1, 1, 1, 1, 1, 1, 1], - "has_phospho_loss": [1, 0, 0, 0, 0, 0, 0, 0], - } - ) - precursor_table = pd.DataFrame( - { - "feature_id": [1, 2], - "feature_ms2_intensity": [1000.0, 50.0], - } - ) - - tout = apply_pre_bm_hypothesis_filters( - transition_table, - precursor_table, - min_supporting_transitions=2, - min_peakgroup_intensity=100.0, - ) - - kept_hypotheses = tout.loc[tout["peptide_id"] != -1, ["feature_id", "peptide_id"]] - kept_hypotheses = kept_hypotheses.drop_duplicates().sort_values( - ["feature_id", "peptide_id"] - ) - tref_hypotheses = pd.DataFrame({"feature_id": [1], "peptide_id": [10]}) - - assert_frame_equal(kept_hypotheses.reset_index(drop=True), tref_hypotheses) - - num_pf = ( - tout[["feature_id", "num_peptidoforms"]] - .drop_duplicates() - .sort_values("feature_id") - .reset_index(drop=True) - ) - tref_num_pf = pd.DataFrame({"feature_id": [1, 2], "num_peptidoforms": [1, 0]}) - - assert_frame_equal(num_pf, tref_num_pf) - - -def test_apply_pre_bm_hypothesis_filters_conditional_intensity(): - transition_table = pd.DataFrame( - { - "feature_id": [1, 1, 1, 2, 2, 3, 3, 3], - "peptide_id": [10, -1, 20, 30, -1, 40, 40, -1], - "transition_id": [100, 100, 200, 300, 300, 400, 401, 400], - "bmask": [1, 1, 1, 1, 1, 1, 1, 1], - "num_peptidoforms": [1, 1, 1, 1, 1, 1, 1, 1], - "pep": [0.01, 0.05, 0.01, 0.01, 0.05, 0.01, 0.02, 0.05], - "n_mapped_peptides": [1, 1, 1, 1, 1, 1, 1, 1], - "has_phospho_loss": [0, 0, 1, 0, 0, 0, 0, 0], - } - ) - precursor_table = pd.DataFrame( - { - "feature_id": [1, 2, 3], - "feature_ms2_intensity": [500.0, 500.0, 20000.0], - } - ) - - tout = apply_pre_bm_hypothesis_filters( - transition_table, - precursor_table, - conditional_min_peakgroup_intensity=10000.0, - conditional_min_peakgroup_intensity_max_supporting_transitions=1, - conditional_min_peakgroup_intensity_no_phospho_loss_only=True, - ) - - kept_hypotheses = ( - tout.loc[tout["peptide_id"] != -1, ["feature_id", "peptide_id"]] - .drop_duplicates() - .sort_values(["feature_id", "peptide_id"]) - .reset_index(drop=True) - ) - tref_hypotheses = pd.DataFrame({"feature_id": [1, 3], "peptide_id": [20, 40]}) - - assert_frame_equal(kept_hypotheses, tref_hypotheses) - - def test_apply_post_ipf_filters(): result = pd.DataFrame( { @@ -452,170 +363,3 @@ def test_apply_post_ipf_filters_conditional_intensity(): tout.sort_values(["FEATURE_ID", "PEPTIDE_ID"]).reset_index(drop=True), tref.sort_values(["FEATURE_ID", "PEPTIDE_ID"]).reset_index(drop=True), ) - - -def test_apply_ipf_score_adjustment_rewards_high_support_and_intensity(): - result = pd.DataFrame( - { - "feature_id": [1, 2], - "hypothesis": [10, 20], - "pep": [0.01, 0.01], - "qvalue": [0.01, 0.01], - "num_peptidoforms": [2, 2], - } - ) - score_metrics = pd.DataFrame( - { - "feature_id": [1, 2], - "peptide_id": [10, 20], - "supporting_transitions": [1, 4], - "unique_supporting_transitions": [1, 2], - "phospho_loss_supporting_transitions": [0, 1], - "feature_ms2_intensity": [1_000.0, 100_000.0], - } - ) - - tout = apply_ipf_score_adjustment( - result, - score_metrics, - score_support_log_weight=1.0, - score_log_intensity_weight=1.0, - score_intensity_reference=10_000.0, - ) - - pep_low = tout.loc[tout["hypothesis"] == 10, "pep"].iloc[0] - pep_high = tout.loc[tout["hypothesis"] == 20, "pep"].iloc[0] - - assert pep_high < pep_low - assert pep_high < 0.01 - assert pep_low > 0.01 - - -def test_apply_ipf_score_adjustment_conditional_mask_only_hits_low_support_no_loss(): - result = pd.DataFrame( - { - "feature_id": [1, 2, 3], - "hypothesis": [10, 20, 30], - "pep": [0.01, 0.01, 0.01], - "qvalue": [0.01, 0.01, 0.01], - "num_peptidoforms": [2, 2, 2], - } - ) - score_metrics = pd.DataFrame( - { - "feature_id": [1, 2, 3], - "peptide_id": [10, 20, 30], - "supporting_transitions": [1, 3, 1], - "unique_supporting_transitions": [1, 2, 1], - "phospho_loss_supporting_transitions": [0, 0, 1], - "feature_ms2_intensity": [1_000.0, 1_000.0, 1_000.0], - } - ) - - tout = apply_ipf_score_adjustment( - result, - score_metrics, - score_log_intensity_weight=1.0, - score_intensity_reference=10_000.0, - score_adjust_max_supporting_transitions=1, - score_adjust_no_phospho_loss_only=True, - ) - - pep_row_1 = tout.loc[tout["hypothesis"] == 10, "pep"].iloc[0] - pep_row_2 = tout.loc[tout["hypothesis"] == 20, "pep"].iloc[0] - pep_row_3 = tout.loc[tout["hypothesis"] == 30, "pep"].iloc[0] - - assert pep_row_1 > 0.01 - assert np.isclose(pep_row_2, 0.01) - assert np.isclose(pep_row_3, 0.01) - - -def test_apply_ipf_score_adjustment_rewards_unique_and_phospho_loss_and_penalizes_overlap(): - result = pd.DataFrame( - { - "feature_id": [1, 2, 3], - "hypothesis": [10, 20, 30], - "pep": [0.01, 0.01, 0.01], - "qvalue": [0.01, 0.01, 0.01], - "num_peptidoforms": [2, 2, 2], - } - ) - score_metrics = pd.DataFrame( - { - "feature_id": [1, 2, 3], - "peptide_id": [10, 20, 30], - "supporting_transitions": [2, 2, 2], - "unique_supporting_transitions": [0, 2, 2], - "phospho_loss_supporting_transitions": [0, 1, 1], - "median_supporting_isotope_overlap": [0.02, 0.02, 0.08], - "feature_ms2_intensity": [10000.0, 10000.0, 10000.0], - } - ) - - tout = apply_ipf_score_adjustment( - result, - score_metrics, - score_unique_support_log_weight=0.5, - score_phospho_loss_bonus=0.5, - score_overlap_penalty_weight=20.0, - score_overlap_reference=0.02, - ) - - pep_base = tout.loc[tout["hypothesis"] == 10, "pep"].iloc[0] - pep_good = tout.loc[tout["hypothesis"] == 20, "pep"].iloc[0] - pep_overlap = tout.loc[tout["hypothesis"] == 30, "pep"].iloc[0] - - assert pep_good < pep_base - assert pep_overlap > pep_good - - -def test_apply_transition_evidence_adjustment_shrinks_noninformative_pep_logits(): - tin = pd.DataFrame( - { - "feature_id": [1, 1, 1], - "transition_id": [100, 101, 102], - "pep": [0.01, 0.01, 0.01], - "n_mapped_peptides": [1, 2, 1], - "has_phospho_loss": [1, 0, 1], - "isotope_overlap_score": [0.01, 0.01, 0.08], - } - ) - - tout = apply_transition_evidence_adjustment( - tin, - transition_evidence_nonunique_logit_scale=0.5, - transition_evidence_no_phospho_loss_logit_scale=0.5, - transition_evidence_overlap_logit_scale_weight=20.0, - transition_evidence_overlap_reference=0.02, - ) - - pep_unique_loss = tout.loc[tout["transition_id"] == 100, "pep"].iloc[0] - pep_nonunique_noloss = tout.loc[tout["transition_id"] == 101, "pep"].iloc[0] - pep_high_overlap = tout.loc[tout["transition_id"] == 102, "pep"].iloc[0] - - assert np.isclose(pep_unique_loss, 0.01) - assert pep_nonunique_noloss > pep_unique_loss - assert pep_high_overlap > pep_unique_loss - - -def test_apply_transition_evidence_adjustment_filters_high_overlap_rows(): - tin = pd.DataFrame( - { - "feature_id": [1, 1], - "transition_id": [100, 101], - "pep": [0.01, 0.01], - "n_mapped_peptides": [1, 1], - "has_phospho_loss": [1, 1], - "isotope_overlap_score": [0.02, 0.08], - } - ) - - tout = apply_transition_evidence_adjustment( - tin, - transition_evidence_max_isotope_overlap=0.05, - ) - - assert_frame_equal( - tout.reset_index(drop=True), - tin.loc[tin["transition_id"] == 100].reset_index(drop=True), - ) From dcd2554b034961d94d34ce3c4e3b56849d710239 Mon Sep 17 00:00:00 2001 From: singjc Date: Wed, 17 Jun 2026 20:12:20 -0400 Subject: [PATCH 4/5] fix: update .gitignore to include tools directory and restore generated API docs --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4f1cc30a..48cb2243 100644 --- a/.gitignore +++ b/.gitignore @@ -40,4 +40,6 @@ nosetests.xml # docs docs/_build/* -docs/api/generated/* \ No newline at end of file +docs/api/generated/* + +tools/* \ No newline at end of file From 663ffd81b229a3ce7c03a397be83e41125789737 Mon Sep 17 00:00:00 2001 From: singjc Date: Thu, 18 Jun 2026 08:40:00 -0400 Subject: [PATCH 5/5] refactor: enhance get_feature_mapping_across_runs function and improve IPF q-value computation with strategy parameter --- pyprophet/glyco/glycoform.py | 141 +++++++++++++++++++++++------------ pyprophet/ipf.py | 29 ++++--- tests/test_pyprophet_ipf.py | 7 +- 3 files changed, 119 insertions(+), 58 deletions(-) diff --git a/pyprophet/glyco/glycoform.py b/pyprophet/glyco/glycoform.py index 96223aa3..3825df65 100644 --- a/pyprophet/glyco/glycoform.py +++ b/pyprophet/glyco/glycoform.py @@ -15,58 +15,105 @@ from .pepmass import GlycoPeptideMassCalculator -def get_feature_mapping_across_runs(infile, min_confidence=0.5): +def get_feature_mapping_across_runs( + infile, max_alignment_pep=0.5, min_mapping_confidence=None +): click.echo("Info: Reading Across Run Feature Alignment Mapping ... ", nl=False) start = time.time() - con = sqlite3.connect(infile) - - query = """ - SELECT - DENSE_RANK() OVER (ORDER BY PRECURSOR_ID, ALIGNMENT_ID) AS ALIGNMENT_GROUP_ID, - ALIGNMENT_ID, - FEATURE_ID, - PRECURSOR_ID, - FEATURE_TYPE - FROM ( - SELECT DISTINCT - ALIGNMENT_ID, - PRECURSOR_ID, - REFERENCE_FEATURE_ID AS FEATURE_ID, - 'REFERENCE' AS FEATURE_TYPE - FROM FEATURE_MS2_ALIGNMENT_CANDIDATE - WHERE SELECTED = 1 - AND MAPPING_CONFIDENCE >= ? - AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID - AND ALIGNED_FEATURE_ID != -1 - - UNION - - SELECT DISTINCT - ALIGNMENT_ID, - PRECURSOR_ID, - ALIGNED_FEATURE_ID AS FEATURE_ID, - 'QUERY' AS FEATURE_TYPE - FROM FEATURE_MS2_ALIGNMENT_CANDIDATE - WHERE SELECTED = 1 - AND MAPPING_CONFIDENCE >= ? - AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID - AND ALIGNED_FEATURE_ID != -1 - ) AS feature_list - ORDER BY - ALIGNMENT_GROUP_ID, - CASE FEATURE_TYPE - WHEN 'REFERENCE' THEN 0 - WHEN 'QUERY' THEN 1 - END - """ + with sqlite3.connect(infile) as con: + use_alignment_candidates = ( + min_mapping_confidence is not None + and check_sqlite_table(con, "FEATURE_MS2_ALIGNMENT_CANDIDATE") + ) - data = pd.read_sql_query( - query, con, params=[min_confidence, min_confidence] - ) + if use_alignment_candidates: + query = """ + SELECT + DENSE_RANK() OVER (ORDER BY PRECURSOR_ID, ALIGNMENT_ID) AS ALIGNMENT_GROUP_ID, + ALIGNMENT_ID, + FEATURE_ID, + PRECURSOR_ID, + FEATURE_TYPE + FROM ( + SELECT DISTINCT + ALIGNMENT_ID, + PRECURSOR_ID, + REFERENCE_FEATURE_ID AS FEATURE_ID, + 'REFERENCE' AS FEATURE_TYPE + FROM FEATURE_MS2_ALIGNMENT_CANDIDATE + WHERE SELECTED = 1 + AND MAPPING_CONFIDENCE >= ? + AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID + AND ALIGNED_FEATURE_ID != -1 + + UNION + + SELECT DISTINCT + ALIGNMENT_ID, + PRECURSOR_ID, + ALIGNED_FEATURE_ID AS FEATURE_ID, + 'QUERY' AS FEATURE_TYPE + FROM FEATURE_MS2_ALIGNMENT_CANDIDATE + WHERE SELECTED = 1 + AND MAPPING_CONFIDENCE >= ? + AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID + AND ALIGNED_FEATURE_ID != -1 + ) AS feature_list + ORDER BY + ALIGNMENT_GROUP_ID, + CASE FEATURE_TYPE + WHEN 'REFERENCE' THEN 0 + WHEN 'QUERY' THEN 1 + END + """ + data = pd.read_sql_query( + query, + con, + params=[min_mapping_confidence, min_mapping_confidence], + ) + else: + if not check_sqlite_table(con, "FEATURE_MS2_ALIGNMENT") or not check_sqlite_table( + con, "SCORE_ALIGNMENT" + ): + raise click.ClickException( + "Perform feature alignment using ARYCAL, and apply scoring to alignment-level data before running glycoform inference." + ) + + query = f""" + SELECT + DENSE_RANK() OVER (ORDER BY PRECURSOR_ID, ALIGNMENT_ID) AS ALIGNMENT_GROUP_ID, + FEATURE_ID + FROM ( + SELECT DISTINCT + ALIGNMENT_ID, + PRECURSOR_ID, + REFERENCE_FEATURE_ID AS FEATURE_ID + FROM FEATURE_MS2_ALIGNMENT + WHERE LABEL = 1 + AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID + + UNION + + SELECT DISTINCT + ALIGNMENT_ID, + PRECURSOR_ID, + ALIGNED_FEATURE_ID AS FEATURE_ID + FROM FEATURE_MS2_ALIGNMENT + WHERE LABEL = 1 + AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID + ) AS feature_list + INNER JOIN ( + SELECT DISTINCT FEATURE_ID + FROM SCORE_ALIGNMENT + WHERE PEP < {max_alignment_pep} + ) AS good_alignments + ON good_alignments.FEATURE_ID = feature_list.FEATURE_ID + ORDER BY ALIGNMENT_GROUP_ID + """ + data = pd.read_sql_query(query, con) data.columns = [col.lower() for col in data.columns] - con.close() end = time.time() click.echo(f"{end-start:.4f} seconds") @@ -634,7 +681,7 @@ def infer_glycoforms( ## prepare for propagating signal across runs for aligned features if propagate_signal_across_runs: across_run_feature_map = get_feature_mapping_across_runs( - infile, max_alignment_pep + infile, max_alignment_pep=max_alignment_pep ) transition_table = pd.merge( transition_table, across_run_feature_map, on="feature_id", how="left" diff --git a/pyprophet/ipf.py b/pyprophet/ipf.py index da3b9878..c02f2d7c 100644 --- a/pyprophet/ipf.py +++ b/pyprophet/ipf.py @@ -30,7 +30,7 @@ import numpy as np import pandas as pd from loguru import logger -from scipy.special import expit, logsumexp +from scipy.special import logsumexp from scipy.stats import rankdata from ._config import IPFIOConfig @@ -95,21 +95,28 @@ def compute_grouped_model_fdr(data_in, group_keys, log_prefix="Grouped FDR"): return qvalues -def compute_ipf_qvalues(pf_pp_data, grouped_fdr=False): +def compute_ipf_qvalues( + pf_pp_data, grouped_fdr=False, grouped_fdr_strategy="num_peptidoforms" +): """ Compute IPF q-values using pooled or grouped model-based FDR. - Grouped FDR groups rows by the per-feature peptidoform count. + Grouped FDR groups rows according to ``grouped_fdr_strategy``. """ if not grouped_fdr: return compute_model_fdr(pf_pp_data["pep"]) - if "num_peptidoforms" not in pf_pp_data.columns: - raise ValueError("num_peptidoforms is required for grouped FDR.") - return compute_grouped_model_fdr( - pf_pp_data["pep"], - pf_pp_data["num_peptidoforms"].fillna(-1).astype(int), - log_prefix="Grouped FDR by num_peptidoforms", + if grouped_fdr_strategy == "num_peptidoforms": + if "num_peptidoforms" not in pf_pp_data.columns: + raise ValueError("num_peptidoforms is required for grouped FDR.") + return compute_grouped_model_fdr( + pf_pp_data["pep"], + pf_pp_data["num_peptidoforms"].fillna(-1).astype(int), + log_prefix="Grouped FDR by num_peptidoforms", + ) + + raise ValueError( + f"Unsupported grouped FDR strategy: {grouped_fdr_strategy!r}" ) @@ -897,6 +904,7 @@ def peptidoform_inference( transition_table, precursor_data, ipf_grouped_fdr, + ipf_grouped_fdr_strategy, propagate_signal_across_runs, across_run_confidence_threshold, ): @@ -907,6 +915,7 @@ def peptidoform_inference( transition_table (pd.DataFrame): Input data containing transition-level information. precursor_data (pd.DataFrame): Precursor-level probabilities. ipf_grouped_fdr (bool): Whether to use grouped FDR estimation. + ipf_grouped_fdr_strategy (str): Grouping strategy used when grouped FDR is enabled. propagate_signal_across_runs (bool): Whether to propagate signal across runs. across_run_confidence_threshold (float): Confidence threshold for propagation. @@ -936,6 +945,7 @@ def peptidoform_inference( pf_pp_data["qvalue"] = compute_ipf_qvalues( pf_pp_data, grouped_fdr=ipf_grouped_fdr, + grouped_fdr_strategy=ipf_grouped_fdr_strategy, ) # merge precursor-level data with UIS data @@ -1009,6 +1019,7 @@ def infer_peptidoforms(config: IPFIOConfig): peptidoform_table, precursor_data, config.ipf_grouped_fdr, + config.ipf_grouped_fdr_strategy, config.propagate_signal_across_runs, config.across_run_confidence_threshold, ) diff --git a/tests/test_pyprophet_ipf.py b/tests/test_pyprophet_ipf.py index 548a6a63..206c2766 100644 --- a/tests/test_pyprophet_ipf.py +++ b/tests/test_pyprophet_ipf.py @@ -132,7 +132,7 @@ def validate_output( for id_col in ["feature_id", "transition_id", "peptide_id"]: if id_col in transition_data.columns: transition_data[id_col] = transition_data[id_col].astype("Int64") - sort_cols = [ + core_transition_cols = [ "feature_id", "transition_id", "pep", @@ -140,9 +140,12 @@ def validate_output( "bmask", "num_peptidoforms", ] + transition_data = transition_data[core_transition_cols] print("Transition Data:", file=regtest) print( - transition_data.sort_values(sort_cols).reset_index(drop=True).head(100), + transition_data.sort_values(core_transition_cols) + .reset_index(drop=True) + .head(100), file=regtest, )