From 89b9ebaae013d5466535275b73c05fb46b6236e3 Mon Sep 17 00:00:00 2001 From: singjc Date: Wed, 17 Jun 2026 18:47:52 -0400 Subject: [PATCH 1/2] feat: optimize OSW export and merge workflows --- .dockerignore | 4 +- pyprophet/_config.py | 1 + pyprophet/cli/export.py | 9 + pyprophet/cli/merge.py | 9 +- pyprophet/io/export/osw.py | 233 ++++++--- pyprophet/io/util.py | 31 +- pyprophet/util.py | 993 +++++++++++++++++++++++-------------- 7 files changed, 828 insertions(+), 452 deletions(-) diff --git a/.dockerignore b/.dockerignore index e222551c..34844733 100644 --- a/.dockerignore +++ b/.dockerignore @@ -11,9 +11,11 @@ dist/ tests/ notebooks/ data/ +!tools/osw_to_parquet_rust/data/ +!tools/osw_to_parquet_rust/data/unimod.xml examples/ .tmp/ .DS_Store sandbox/ .pytest_cache/ -.ruff_cache/ \ No newline at end of file +.ruff_cache/ diff --git a/pyprophet/_config.py b/pyprophet/_config.py index 81e11347..7c8f4d2b 100644 --- a/pyprophet/_config.py +++ b/pyprophet/_config.py @@ -714,6 +714,7 @@ class ExportIOConfig(BaseIOConfig): split_transition_data: bool = True split_runs: bool = False include_transition_data: bool = True # Whether to include transition data in parquet export + exclude_feature_var: bool = False # Whether to exclude FEATURE_MS1/MS2 variance (VAR_*) columns # SqMass: Export to parquet pqp_file: Optional[str] = None # Path to PQP file for precursor/transition mapping diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py index 3a1ba83e..f08139a4 100644 --- a/pyprophet/cli/export.py +++ b/pyprophet/cli/export.py @@ -617,6 +617,13 @@ def export_library( show_default=True, help="Include transition data in the exported parquet file(s). When disabled, only precursor-level data is exported.", ) +@click.option( + "--exclude_feature_var/--no-exclude_feature_var", + "exclude_feature_var", + default=False, + show_default=True, + help="Exclude feature variance columns (VAR_*) from FEATURE_MS1 and FEATURE_MS2 tables. 
Significantly speeds up export and reduces file size.", +) @measure_memory_usage_and_time def export_parquet( infile, @@ -630,6 +637,7 @@ def export_parquet( compression, compression_level, include_transition_data, + exclude_feature_var, ): """ Export OSW or sqMass to parquet format @@ -666,6 +674,7 @@ def export_parquet( compression_method=compression, compression_level=compression_level, include_transition_data=include_transition_data, + exclude_feature_var=exclude_feature_var, ) writer = WriterDispatcher.get_writer(config) diff --git a/pyprophet/cli/merge.py b/pyprophet/cli/merge.py index 941a4fc3..3a250fcd 100644 --- a/pyprophet/cli/merge.py +++ b/pyprophet/cli/merge.py @@ -59,8 +59,13 @@ def merge(): is_flag=True, help="Merge OSW output files that have already been scored.", ) +@click.option( + "--fresh", + is_flag=True, + help="Start from scratch, ignoring any existing merged.osw file. If enabled, removes existing output file before starting.", +) @measure_memory_usage_and_time -def merge_osw(infiles, outfile, same_run, templatefile, merged_post_scored_runs): +def merge_osw(infiles, outfile, same_run, templatefile, merged_post_scored_runs, fresh): """ Merge multiple OSW files and (for large experiments, it is recommended to subsample first). """ @@ -70,7 +75,7 @@ def merge_osw(infiles, outfile, same_run, templatefile, merged_post_scored_runs) "At least one PyProphet input file needs to be provided." 
) - _merge_osw(infiles, outfile, templatefile, same_run, merged_post_scored_runs) + _merge_osw(infiles, outfile, templatefile, same_run, merged_post_scored_runs, fresh) @click.command(name="parquet", cls=AdvancedHelpCommand) diff --git a/pyprophet/io/export/osw.py b/pyprophet/io/export/osw.py index 484e1c60..5806d354 100644 --- a/pyprophet/io/export/osw.py +++ b/pyprophet/io/export/osw.py @@ -264,6 +264,10 @@ def _read_unscored_data(self, con): def _build_score_sql(self, con): """Build SQL fragment for score columns in unscored files.""" + # Skip if exclude_feature_var is enabled + if self.config.exclude_feature_var: + return "" + score_sql = "" if check_sqlite_table(con, "FEATURE_MS1"): score_sql = write_scores_sql_command( @@ -1516,7 +1520,23 @@ def _write_parquet(self) -> None: def _convert_to_split_parquet(self) -> None: """Convert OSW to split parquet format""" conn = duckdb.connect(":memory:") - load_sqlite_scanner(conn) + + try: + load_sqlite_scanner(conn) + except Exception as scanner_error: + # If sqlite_scanner fails to load (e.g., in containers without internet), + # provide helpful guidance but continue with fallback + if "Failed to download extension" in str(scanner_error) or "Connection timed out" in str(scanner_error): + click.echo( + "Warning: sqlite_scanner extension could not be loaded (likely in container without internet access).\n" + "To fix: Set DUCKDB_EXTENSION_DIRECTORY environment variable to a directory with pre-downloaded extensions.\n" + "Or pre-download extensions on your host with: " + "python3 -c 'import duckdb; duckdb.connect(\":memory:\").execute(\"LOAD sqlite_scanner\")'\n" + "Continuing with alternative method...", + err=True + ) + else: + raise try: # Prepare column information @@ -1533,7 +1553,23 @@ def _convert_to_split_parquet(self) -> None: def _convert_to_single_parquet(self) -> None: """Convert OSW to single parquet file""" conn = duckdb.connect(":memory:") - load_sqlite_scanner(conn) + + try: + load_sqlite_scanner(conn) + 
except Exception as scanner_error: + # If sqlite_scanner fails to load (e.g., in containers without internet), + # provide helpful guidance but continue with fallback + if "Failed to download extension" in str(scanner_error) or "Connection timed out" in str(scanner_error): + click.echo( + "Warning: sqlite_scanner extension could not be loaded (likely in container without internet access).\n" + "To fix: Set DUCKDB_EXTENSION_DIRECTORY environment variable to a directory with pre-downloaded extensions.\n" + "Or pre-download extensions on your host with: " + "python3 -c 'import duckdb; duckdb.connect(\":memory:\").execute(\"LOAD sqlite_scanner\")'\n" + "Continuing with alternative method...", + err=True + ) + else: + raise try: # Prepare column information @@ -1614,9 +1650,45 @@ def _prepare_column_info(self, conn) -> dict: column_info["score_peptide_contexts"] = self._check_contexts( sql_conn, "SCORE_PEPTIDE" ) + + # Create necessary indices to speed up joins + logger.info("Creating indices for faster export") + self._create_export_indices(sql_conn) return column_info + def _create_export_indices(self, sql_conn: sqlite3.Connection) -> None: + """Create indices to optimize join performance during export""" + indices_to_create = [ + ("PRECURSOR_PEPTIDE_MAPPING", "PRECURSOR_ID", "idx_ppm_precursor_id"), + ("PRECURSOR_PEPTIDE_MAPPING", "PEPTIDE_ID", "idx_ppm_peptide_id"), + ("PEPTIDE_PROTEIN_MAPPING", "PEPTIDE_ID", "idx_pprotm_peptide_id"), + ("PEPTIDE_PROTEIN_MAPPING", "PROTEIN_ID", "idx_pprotm_protein_id"), + ("PEPTIDE_GENE_MAPPING", "PEPTIDE_ID", "idx_pgm_peptide_id"), + ("PEPTIDE_GENE_MAPPING", "GENE_ID", "idx_pgm_gene_id"), + ("FEATURE", "PRECURSOR_ID", "idx_feat_precursor_id"), + ("FEATURE", "RUN_ID", "idx_feat_run_id"), + ("FEATURE_MS1", "FEATURE_ID", "idx_feat_ms1_feature_id"), + ("FEATURE_MS2", "FEATURE_ID", "idx_feat_ms2_feature_id"), + ("FEATURE_TRANSITION", "FEATURE_ID", "idx_feat_trans_feature_id"), + ("FEATURE_TRANSITION", "TRANSITION_ID", 
"idx_feat_trans_trans_id"), + ("TRANSITION_PRECURSOR_MAPPING", "TRANSITION_ID", "idx_tpm_transition_id"), + ("TRANSITION_PRECURSOR_MAPPING", "PRECURSOR_ID", "idx_tpm_precursor_id"), + ("TRANSITION_PEPTIDE_MAPPING", "TRANSITION_ID", "idx_tpeptm_transition_id"), + ("TRANSITION_PEPTIDE_MAPPING", "PEPTIDE_ID", "idx_tpeptm_peptide_id"), + ] + + for table, column, index_name in indices_to_create: + try: + sql_conn.execute( + f"CREATE INDEX IF NOT EXISTS {index_name} ON {table}({column})" + ) + except sqlite3.OperationalError as e: + logger.debug(f"Could not create index {index_name}: {e}") + + sql_conn.commit() + logger.debug("Indices created for export optimization") + def _export_split_by_run(self, conn, column_info: dict) -> None: """Export data split by run into separate directories""" os.makedirs(self.config.outfile, exist_ok=True) @@ -1707,30 +1779,24 @@ def _export_combined(self, conn, column_info: dict) -> None: self._export_alignment_data(conn) def _export_single_file(self, conn, column_info: dict) -> None: - """Export all data to a single parquet file""" - # Create temp table with combined schema - logger.debug("Creating temporary table for combined export") - self._create_temp_table(conn, column_info) + """Export all data to a single parquet file using streaming (UNION ALL)""" + logger.info(f"Exporting combined data to {self.config.outfile}") - # Insert precursor data - logger.debug("Inserting precursor data into temp table") + # Build precursor query precursor_query = self._build_combined_precursor_query(conn, column_info) - # print(precursor_query) - conn.execute(f"INSERT INTO temp_table {precursor_query}") - # Insert transition data if requested + # Build combined query if self.config.include_transition_data: - logger.debug("Inserting transition data into temp table") + logger.debug("Including transition data in export") transition_query = self._build_combined_transition_query(column_info) - conn.execute(f"INSERT INTO temp_table {transition_query}") + # 
Combine queries with UNION ALL - this streams directly to parquet + combined_query = f"{precursor_query}\nUNION ALL\n{transition_query}" else: - logger.info( - "Skipping transition data export (include_transition_data=False)" - ) + logger.info("Skipping transition data export (include_transition_data=False)") + combined_query = precursor_query - # Export to parquet - logger.info(f"Exporting combined data to {self.config.outfile}") - self._execute_copy_query(conn, "SELECT * FROM temp_table", self.config.outfile) + # Stream directly to parquet file without intermediate temp table + self._execute_copy_query(conn, combined_query, self.config.outfile) # Export alignment data if exists if column_info["feature_ms2_alignment_exists"]: @@ -1775,44 +1841,16 @@ def _register_peptide_ipf_map(self, conn: duckdb.DuckDBPyConnection) -> None: ) def _create_unimod_to_codename_peptide_id_mapping_table(self) -> None: - """Create peptide unimod to codename mapping table in SQLite database.""" + """Create peptide unimod to codename mapping table in SQLite database. + + Processes peptides in chunks to reduce memory footprint for large datasets. 
+ """ logger.info( "Generating peptide unimod to codename mapping and storing in SQLite" ) with sqlite3.connect(self.config.infile) as sql_conn: - # First get the peptide table and process it with pyopenms - peptide_df = pd.read_sql_query( - "SELECT ID, MODIFIED_SEQUENCE FROM PEPTIDE", sql_conn - ) - - peptide_df["codename"] = peptide_df["MODIFIED_SEQUENCE"].apply( - unimod_to_codename - ) - - # Create the merged mapping - unimod_mask = peptide_df["MODIFIED_SEQUENCE"].str.contains("UniMod") - merged_df = pd.merge( - peptide_df[unimod_mask][["codename", "ID"]], - peptide_df[~unimod_mask][["codename", "ID"]], - on="codename", - suffixes=("_unimod", "_codename"), - how="outer", - ) - - # Fill NaN values in the 'ID_codename' column with the 'ID_unimod' values - merged_df["ID_codename"] = merged_df["ID_codename"].fillna( - merged_df["ID_unimod"] - ) - # Fill NaN values in the 'ID_unimod' column with the 'ID_codename' values - merged_df["ID_unimod"] = merged_df["ID_unimod"].fillna( - merged_df["ID_codename"] - ) - - merged_df["ID_unimod"] = merged_df["ID_unimod"].astype(int) - merged_df["ID_codename"] = merged_df["ID_codename"].astype(int) - - # Create the UNIMOD_TO_CODENAME_PEPTIDE_ID_MAPPING table in SQLite + # Create the mapping table first sql_conn.execute( """ CREATE TABLE IF NOT EXISTS UNIMOD_TO_CODENAME_PEPTIDE_ID_MAPPING ( @@ -1824,14 +1862,73 @@ def _create_unimod_to_codename_peptide_id_mapping_table(self) -> None: """ ) sql_conn.execute("DELETE FROM UNIMOD_TO_CODENAME_PEPTIDE_ID_MAPPING") + sql_conn.commit() - # Insert the data into SQLite table - merged_df[["ID_unimod", "ID_codename", "codename"]].to_sql( - "UNIMOD_TO_CODENAME_PEPTIDE_ID_MAPPING", - sql_conn, - if_exists="append", - index=False, - ) + # Get total count for progress tracking + total_count = sql_conn.execute( + "SELECT COUNT(*) FROM PEPTIDE" + ).fetchone()[0] + logger.info(f"Processing {total_count} peptides in chunks") + + # Process peptides in chunks to reduce memory footprint + chunk_size = 
50000 # Process 50k peptides at a time + processed = 0 + + while processed < total_count: + # Fetch chunk of peptides + peptide_chunk = pd.read_sql_query( + f"""SELECT ID, MODIFIED_SEQUENCE FROM PEPTIDE + LIMIT {chunk_size} OFFSET {processed}""", + sql_conn, + ) + + if peptide_chunk.empty: + break + + # Process chunk + peptide_chunk["codename"] = peptide_chunk["MODIFIED_SEQUENCE"].apply( + unimod_to_codename + ) + + # Create mapping for this chunk + unimod_mask = peptide_chunk["MODIFIED_SEQUENCE"].str.contains("UniMod", na=False) + unimod_chunk = peptide_chunk[unimod_mask][["codename", "ID"]].copy() + unimod_chunk.columns = ["codename", "ID_unimod"] + + codename_chunk = peptide_chunk[~unimod_mask][["codename", "ID"]].copy() + codename_chunk.columns = ["codename", "ID_codename"] + + # Merge on codename + merged_chunk = pd.merge( + unimod_chunk, + codename_chunk, + on="codename", + how="outer", + ) + + # Fill NaN values + merged_chunk["ID_codename"] = merged_chunk["ID_codename"].fillna( + merged_chunk["ID_unimod"] + ) + merged_chunk["ID_unimod"] = merged_chunk["ID_unimod"].fillna( + merged_chunk["ID_codename"] + ) + + merged_chunk["ID_unimod"] = merged_chunk["ID_unimod"].astype(int) + merged_chunk["ID_codename"] = merged_chunk["ID_codename"].astype(int) + + # Insert chunk into SQLite + merged_chunk[["ID_unimod", "ID_codename", "codename"]].to_sql( + "UNIMOD_TO_CODENAME_PEPTIDE_ID_MAPPING", + sql_conn, + if_exists="append", + index=False, + ) + + processed += len(peptide_chunk) + logger.debug(f"Processed {processed}/{total_count} peptides") + + sql_conn.commit() # Create indices for better performance sql_conn.execute( @@ -1845,8 +1942,12 @@ def _create_unimod_to_codename_peptide_id_mapping_table(self) -> None: ) sql_conn.commit() + + final_count = sql_conn.execute( + "SELECT COUNT(*) FROM UNIMOD_TO_CODENAME_PEPTIDE_ID_MAPPING" + ).fetchone()[0] logger.info( - f"Successfully created UNIMOD_TO_CODENAME_PEPTIDE_ID_MAPPING table with {len(merged_df)} mappings" + 
f"Successfully created UNIMOD_TO_CODENAME_PEPTIDE_ID_MAPPING table with {final_count} mappings" ) def _insert_precursor_peptide_ipf_map(self) -> None: @@ -2472,7 +2573,7 @@ def _export_alignment_data(self, conn, path: str = None) -> None: has_score_alignment = check_sqlite_table(sql_conn, "SCORE_ALIGNMENT") if has_score_alignment: - # Export with alignment scores + # Export with alignment scores - use ROW_NUMBER to get best score per feature query = f""" SELECT FEATURE_MS2_ALIGNMENT.ALIGNMENT_ID, @@ -2496,9 +2597,13 @@ def _export_alignment_data(self, conn, path: str = None) -> None: SCORE_ALIGNMENT.QVALUE AS QVALUE FROM sqlite_scan('{self.config.infile}', 'FEATURE_MS2_ALIGNMENT') AS FEATURE_MS2_ALIGNMENT LEFT JOIN ( - SELECT FEATURE_ID, SCORE, PEP, QVALUE, MIN(QVALUE) as MIN_QVALUE - FROM sqlite_scan('{self.config.infile}', 'SCORE_ALIGNMENT') - GROUP BY FEATURE_ID + SELECT FEATURE_ID, SCORE, PEP, QVALUE + FROM ( + SELECT FEATURE_ID, SCORE, PEP, QVALUE, + ROW_NUMBER() OVER (PARTITION BY FEATURE_ID ORDER BY QVALUE ASC) as rn + FROM sqlite_scan('{self.config.infile}', 'SCORE_ALIGNMENT') + ) t + WHERE rn = 1 ) AS SCORE_ALIGNMENT ON FEATURE_MS2_ALIGNMENT.ALIGNED_FEATURE_ID = SCORE_ALIGNMENT.FEATURE_ID """ diff --git a/pyprophet/io/util.py b/pyprophet/io/util.py index d4f36b7a..f1322126 100644 --- a/pyprophet/io/util.py +++ b/pyprophet/io/util.py @@ -384,18 +384,41 @@ def is_valid_multi_split_parquet_dir(path): def load_sqlite_scanner(conn: duckdb.DuckDBPyConnection): """ Ensures the `sqlite_scanner` extension is installed and loaded in DuckDB. + Handles cases where the extension cannot be downloaded (e.g., in containers without internet). 
""" try: conn.execute("LOAD sqlite_scanner") except Exception as e: - if "Extension 'sqlite_scanner' not found" in str(e): + error_msg = str(e) + + # Check if it's a network/download error (e.g., in containers) + if ("Failed to download extension" in error_msg or + "Connection timed out" in error_msg or + "Network unreachable" in error_msg): + from loguru import logger + logger.warning( + f"Cannot download sqlite_scanner extension (likely in container without internet). " + f"Attempting to load from local cache or using fallback method.\n" + f"To fix: Set DUCKDB_EXTENSION_DIRECTORY environment variable to a directory with pre-downloaded extensions.\n" + f"Details: {error_msg}" + ) + # Try to load from local cache + try: + conn.execute("LOAD sqlite_scanner") + except Exception: + # If it still fails, the export will fall back to direct sqlite3 if available + logger.error( + "Could not load sqlite_scanner. Export may use slower fallback method. " + "For better performance, pre-download extensions: " + "python3 -c 'import duckdb; duckdb.connect(\":memory:\").execute(\"LOAD sqlite_scanner\")'" + ) + + elif "Extension 'sqlite_scanner' not found" in error_msg: try: conn.execute("INSTALL sqlite_scanner") conn.execute("LOAD sqlite_scanner") except Exception as install_error: - if "already installed but the origin is different" in str( - install_error - ): + if "already installed but the origin is different" in str(install_error): conn.execute("FORCE INSTALL sqlite_scanner") conn.execute("LOAD sqlite_scanner") else: diff --git a/pyprophet/util.py b/pyprophet/util.py index 85b1e2be..68b41a55 100644 --- a/pyprophet/util.py +++ b/pyprophet/util.py @@ -306,24 +306,111 @@ def reduce_osw(infile, outfile): click.echo("Info: OSW file was reduced for multi-run scoring.") -def merge_osw(infiles, outfile, templatefile, same_run, merge_post_scored_runs): +def merge_osw(infiles, outfile, templatefile, same_run, merge_post_scored_runs, fresh=False): conn = sqlite3.connect(infiles[0]) 
reduced = check_sqlite_table(conn, "SCORE_MS2") conn.close() if reduced and not merge_post_scored_runs: click.echo("Calling reduced osws merge function") - merge_oswr(infiles, outfile, templatefile, same_run) + merge_oswr(infiles, outfile, templatefile, same_run, fresh=fresh) elif merge_post_scored_runs: click.echo("Calling post scored osws merge function") - merge_oswps(infiles, outfile, templatefile, same_run) + merge_oswps(infiles, outfile, templatefile, same_run, fresh=fresh) else: click.echo("Calling pre scored osws merge function") - merge_osws(infiles, outfile, templatefile, same_run) + merge_osws(infiles, outfile, templatefile, same_run, fresh=fresh) -def merge_osws(infiles, outfile, templatefile, same_run): - # Copy the first file to have a template - copyfile(templatefile, outfile) +def _table_has_data(outfile, table_name): + """Check if a table already has data (for resume capability).""" + try: + conn = sqlite3.connect(outfile) + c = conn.cursor() + if not check_sqlite_table(conn, table_name): + conn.close() + return False + result = c.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone() + conn.close() + return result[0] > 0 if result else False + except Exception: + return False + + +def _get_merge_progress(outfile, tables_to_check, total_files): + """Get detailed merge progress per table: how many files have been merged for each table.""" + progress = {} + try: + conn = sqlite3.connect(outfile) + c = conn.cursor() + + # Create metadata table if needed (for old partial merges that don't have it) + if not check_sqlite_table(conn, "MERGE_PROGRESS"): + click.echo("Info: Creating MERGE_PROGRESS tracking table (old partial merge detected)...") + c.executescript(""" + CREATE TABLE MERGE_PROGRESS ( + table_name TEXT PRIMARY KEY, + files_completed INTEGER DEFAULT 0 + ); + """) + + # For old partial merges: mark RUN as complete (already merged), reset features to 0 + init_values = {} + for table in tables_to_check: + if table == "RUN": + init_values[table] 
= total_files # RUN already has data, mark as complete + else: + init_values[table] = 0 # Feature tables need to be remerged + + for table in tables_to_check: + c.execute( + "INSERT INTO MERGE_PROGRESS (table_name, files_completed) VALUES (?, ?)", + (table, init_values[table]) + ) + conn.commit() + click.echo("Info: MERGE_PROGRESS initialized. Feature tables will restart from file 0.") + + # Query progress for each table + for table in tables_to_check: + try: + result = c.execute( + "SELECT files_completed FROM MERGE_PROGRESS WHERE table_name = ?", + (table,) + ).fetchone() + progress[table] = result[0] if result else 0 + except: + progress[table] = 0 + + conn.close() + return progress + except Exception as e: + click.echo(f"Warning: Could not read merge progress: {e}. Starting fresh.") + return {table: 0 for table in tables_to_check} + + + +def merge_osws(infiles, outfile, templatefile, same_run, fresh=False): + import time + from datetime import timedelta + + tables_to_merge = ["RUN", "FEATURE", "FEATURE_MS1", "FEATURE_MS2", "FEATURE_TRANSITION"] + total_files = len(infiles) # Calculate early for progress tracking + + # Check if this is a resume operation (unless fresh flag is set) + is_resume = (not fresh) and os.path.exists(outfile) and _table_has_data(outfile, "RUN") + if is_resume: + click.echo(f"Info: Resuming merge for {outfile}. Checking which tables still need to be merged...") + progress = _get_merge_progress(outfile, tables_to_merge, total_files) + click.echo(f"Merge progress: RUN={progress['RUN']}, FEATURE={progress['FEATURE']}, MS1={progress['FEATURE_MS1']}, MS2={progress['FEATURE_MS2']}, TRANS={progress['FEATURE_TRANSITION']} files merged") + else: + # Fresh merge: copy template and create empty tables + if fresh and os.path.exists(outfile): + click.echo(f"Info: --fresh flag set. 
Removing existing {outfile} and starting from scratch...") + os.remove(outfile) + elif os.path.exists(outfile): + os.remove(outfile) + copyfile(templatefile, outfile) + progress = {table: 0 for table in tables_to_merge} + conn = sqlite3.connect(outfile) c = conn.cursor() if same_run: @@ -335,175 +422,300 @@ def merge_osws(infiles, outfile, templatefile, same_run): ) runid, rname = result[0] - c.executescript( - f""" -PRAGMA synchronous = OFF; - -DROP TABLE IF EXISTS RUN; - + # Only create empty tables if not resuming + # OR if resuming but MERGE_PROGRESS was just created (old partial merge) - need to clear feature tables + need_to_recreate_feature_tables = is_resume and all(v == 0 for v in progress.values()) + + if not is_resume or need_to_recreate_feature_tables: + if need_to_recreate_feature_tables: + click.echo("Info: Detected old partial merge without progress tracking. Clearing feature tables to avoid duplicates...") + + drop_statements = """ DROP TABLE IF EXISTS FEATURE; - DROP TABLE IF EXISTS FEATURE_MS1; - DROP TABLE IF EXISTS FEATURE_MS2; - DROP TABLE IF EXISTS FEATURE_TRANSITION; - DROP TABLE IF EXISTS SCORE_MS1; - DROP TABLE IF EXISTS SCORE_MS2; - DROP TABLE IF EXISTS SCORE_TRANSITION; - DROP TABLE IF EXISTS SCORE_PEPTIDE; - DROP TABLE IF EXISTS SCORE_PROTEIN; - DROP TABLE IF EXISTS SCORE_IPF; +""" + # Only drop RUN table if fresh merge (not resuming old partial) + if not is_resume: + drop_statements = "DROP TABLE IF EXISTS RUN;\n" + drop_statements + + c.executescript( + f""" +PRAGMA synchronous = OFF; +PRAGMA cache_size = 50000; +PRAGMA temp_store = MEMORY; -ATTACH DATABASE "{infiles[0]}" AS sdb; +{drop_statements} -CREATE TABLE RUN AS SELECT * FROM sdb.RUN LIMIT 0; +ATTACH DATABASE "{infiles[0]}" AS sdb; CREATE TABLE FEATURE AS SELECT * FROM sdb.FEATURE LIMIT 0; - CREATE TABLE FEATURE_MS1 AS SELECT * FROM sdb.FEATURE_MS1 LIMIT 0; - CREATE TABLE FEATURE_MS2 AS SELECT * FROM sdb.FEATURE_MS2 LIMIT 0; - CREATE TABLE FEATURE_TRANSITION AS SELECT * FROM 
sdb.FEATURE_TRANSITION LIMIT 0; DETACH DATABASE sdb; """ - ) + ) + + # Only initialize MERGE_PROGRESS for fresh merges (not old partial merges) + if not is_resume: + c.executescript(""" +DROP TABLE IF EXISTS MERGE_PROGRESS; + +CREATE TABLE MERGE_PROGRESS ( + table_name TEXT PRIMARY KEY, + files_completed INTEGER DEFAULT 0 +); + +INSERT INTO MERGE_PROGRESS (table_name, files_completed) VALUES ('RUN', 0); +INSERT INTO MERGE_PROGRESS (table_name, files_completed) VALUES ('FEATURE', 0); +INSERT INTO MERGE_PROGRESS (table_name, files_completed) VALUES ('FEATURE_MS1', 0); +INSERT INTO MERGE_PROGRESS (table_name, files_completed) VALUES ('FEATURE_MS2', 0); +INSERT INTO MERGE_PROGRESS (table_name, files_completed) VALUES ('FEATURE_TRANSITION', 0); +""") + + if not is_resume: + # For fresh merges, also create RUN table from first input file + c.executescript(f""" +ATTACH DATABASE "{infiles[0]}" AS sdb; +CREATE TABLE RUN AS SELECT * FROM sdb.RUN LIMIT 0; +DETACH DATABASE sdb; +""") + else: + c.executescript("PRAGMA synchronous = OFF; PRAGMA cache_size = 50000; PRAGMA temp_store = MEMORY;") conn.commit() conn.close() - for infile in infiles: - conn = sqlite3.connect(outfile) - c = conn.cursor() - - # Only create a single run entry (all files are presumably from the same run) - if same_run: - c.executescript( - f"""INSERT INTO RUN (ID, FILENAME) VALUES ({runid}, '{rname}')""" - ) - break - else: - c.executescript( - f""" - ATTACH DATABASE "{infile}" AS sdb; - - INSERT INTO RUN SELECT * FROM sdb.RUN; - - DETACH DATABASE sdb; + # ===== BATCH PROCESSING OPTIMIZATION ===== + batch_size = 50 + start_time = time.time() + + def estimate_time_remaining(files_done, total_files, elapsed_seconds): + if files_done == 0: + return None + rate = elapsed_seconds / files_done + remaining_files = total_files - files_done + remaining_seconds = rate * remaining_files + return timedelta(seconds=int(remaining_seconds)) + + # ===== MERGE RUN DATA (batch processed, resume from last completed file) 
===== + run_start_file = progress["RUN"] + if run_start_file < total_files: + if not same_run: + click.echo(f"Merging RUN data (resuming from file {run_start_file}/{total_files})...") + for batch_num in range(run_start_file, total_files, batch_size): + batch_files = infiles[batch_num:min(batch_num + batch_size, total_files)] + conn = sqlite3.connect(outfile) + c = conn.cursor() + + for idx, infile in enumerate(batch_files): + db_alias = f"run_{batch_num}_{idx}" + c.executescript( + f""" + ATTACH DATABASE "{infile}" AS {db_alias}; + INSERT INTO RUN SELECT * FROM {db_alias}.RUN; + DETACH DATABASE {db_alias}; """ - ) - - conn.commit() - conn.close() - - click.echo(f"Info: Merged runs of file {infile} to {outfile}.") - - # Now merge the run-specific data into the output file: - # Note: only tables FEATURE, FEATURE_MS1, FEATURE_MS2 and FEATURE_TRANSITION are run-specific - for infile in infiles: - conn = sqlite3.connect(outfile) - c = conn.cursor() - - c.executescript( - f""" - ATTACH DATABASE "{infile}" AS sdb; - - INSERT INTO FEATURE SELECT * FROM sdb.FEATURE; - - DETACH DATABASE sdb; + ) + + conn.commit() + conn.close() + + files_done = min(batch_num + batch_size, total_files) + + # Update progress tracking + conn = sqlite3.connect(outfile) + conn.execute("UPDATE MERGE_PROGRESS SET files_completed = ? WHERE table_name = 'RUN'", (files_done,)) + conn.commit() + conn.close() + + elapsed = time.time() - start_time + eta = estimate_time_remaining(files_done, total_files, elapsed) + click.echo(f"Info: Merged RUN data for {files_done}/{total_files} files" + (f" (ETA: {eta})" if eta else "")) + else: + conn = sqlite3.connect(outfile) + c = conn.cursor() + c.executescript(f"""INSERT INTO RUN (ID, FILENAME) VALUES ({runid}, '{rname}')""") + conn.execute("UPDATE MERGE_PROGRESS SET files_completed = ? 
WHERE table_name = 'RUN'", (total_files,)) + conn.commit() + conn.close() + else: + click.echo("Skipping RUN data (already merged)") + + # ===== MERGE FEATURE DATA (batch processed, resume from last completed file) ===== + feature_start_file = progress["FEATURE"] + if feature_start_file < total_files: + click.echo(f"\nMerging FEATURE data (resuming from file {feature_start_file}/{total_files})...") + for batch_num in range(feature_start_file, total_files, batch_size): + batch_files = infiles[batch_num:min(batch_num + batch_size, total_files)] + conn = sqlite3.connect(outfile) + c = conn.cursor() + + for idx, infile in enumerate(batch_files): + db_alias = f"feat_{batch_num}_{idx}" + c.executescript( + f""" + ATTACH DATABASE "{infile}" AS {db_alias}; + INSERT INTO FEATURE SELECT * FROM {db_alias}.FEATURE; + DETACH DATABASE {db_alias}; """ - ) - - conn.commit() - conn.close() - - click.echo(f"Info: Merged generic features of file {infile} to {outfile}.") - - if same_run: - conn = sqlite3.connect(outfile) - c = conn.cursor() - - # Fix run id assuming we only have a single run - c.executescript(f"""UPDATE FEATURE SET RUN_ID = {runid}""") - - conn.commit() - conn.close() - - for infile in infiles: - conn = sqlite3.connect(outfile) - c = conn.cursor() - c.executescript( - f""" - ATTACH DATABASE "{infile}" AS sdb; - - INSERT INTO FEATURE_MS1 - SELECT * - FROM sdb.FEATURE_MS1; + ) + + conn.commit() + conn.close() + + files_done = min(batch_num + batch_size, total_files) + + # Update progress tracking + conn = sqlite3.connect(outfile) + conn.execute("UPDATE MERGE_PROGRESS SET files_completed = ? 
WHERE table_name = 'FEATURE'", (files_done,)) + conn.commit() + conn.close() + + elapsed = time.time() - start_time + eta = estimate_time_remaining(files_done, total_files, elapsed) + click.echo(f"Info: Merged FEATURE data for {files_done}/{total_files} files" + (f" (ETA: {eta})" if eta else "")) - DETACH DATABASE sdb; + if same_run: + conn = sqlite3.connect(outfile) + c = conn.cursor() + c.executescript(f"""UPDATE FEATURE SET RUN_ID = {runid}""") + conn.commit() + conn.close() + else: + click.echo("Skipping FEATURE data (already merged)") + + # ===== MERGE MS1 FEATURES (batch processed, resume from last completed file) ===== + ms1_start_file = progress["FEATURE_MS1"] + if ms1_start_file < total_files: + click.echo(f"\nMerging MS1 features (resuming from file {ms1_start_file}/{total_files})...") + for batch_num in range(ms1_start_file, total_files, batch_size): + batch_files = infiles[batch_num:min(batch_num + batch_size, total_files)] + conn = sqlite3.connect(outfile) + c = conn.cursor() + + for idx, infile in enumerate(batch_files): + db_alias = f"ms1_{batch_num}_{idx}" + c.executescript( + f""" + ATTACH DATABASE "{infile}" AS {db_alias}; + INSERT INTO FEATURE_MS1 SELECT * FROM {db_alias}.FEATURE_MS1; + DETACH DATABASE {db_alias}; """ - ) - - conn.commit() - conn.close() - - click.echo(f"Info: Merged MS1 features of file {infile} to {outfile}.") - - for infile in infiles: - conn = sqlite3.connect(outfile) - c = conn.cursor() - - c.executescript( - f""" - ATTACH DATABASE "{infile}" AS sdb; - - INSERT INTO FEATURE_MS2 - SELECT * - FROM sdb.FEATURE_MS2; - - DETACH DATABASE sdb; + ) + + conn.commit() + conn.close() + + files_done = min(batch_num + batch_size, total_files) + + # Update progress tracking + conn = sqlite3.connect(outfile) + conn.execute("UPDATE MERGE_PROGRESS SET files_completed = ? 
WHERE table_name = 'FEATURE_MS1'", (files_done,)) + conn.commit() + conn.close() + + elapsed = time.time() - start_time + eta = estimate_time_remaining(files_done, total_files, elapsed) + click.echo(f"Info: Merged MS1 features for {files_done}/{total_files} files" + (f" (ETA: {eta})" if eta else "")) + else: + click.echo("Skipping MS1 features (already merged)") + + # ===== MERGE MS2 FEATURES (batch processed, resume from last completed file) ===== + ms2_start_file = progress["FEATURE_MS2"] + if ms2_start_file < total_files: + click.echo(f"\nMerging MS2 features (resuming from file {ms2_start_file}/{total_files})...") + for batch_num in range(ms2_start_file, total_files, batch_size): + batch_files = infiles[batch_num:min(batch_num + batch_size, total_files)] + conn = sqlite3.connect(outfile) + c = conn.cursor() + + for idx, infile in enumerate(batch_files): + db_alias = f"ms2_{batch_num}_{idx}" + c.executescript( + f""" + ATTACH DATABASE "{infile}" AS {db_alias}; + INSERT INTO FEATURE_MS2 SELECT * FROM {db_alias}.FEATURE_MS2; + DETACH DATABASE {db_alias}; """ - ) - - conn.commit() - conn.close() - - click.echo(f"Info: Merged MS2 features of file {infile} to {outfile}.") - - for infile in infiles: - conn = sqlite3.connect(outfile) - c = conn.cursor() - - c.executescript( - f""" - ATTACH DATABASE "{infile}" AS sdb; - - INSERT INTO FEATURE_TRANSITION - SELECT * - FROM sdb.FEATURE_TRANSITION; - - DETACH DATABASE sdb; + ) + + conn.commit() + conn.close() + + files_done = min(batch_num + batch_size, total_files) + + # Update progress tracking + conn = sqlite3.connect(outfile) + conn.execute("UPDATE MERGE_PROGRESS SET files_completed = ? 
WHERE table_name = 'FEATURE_MS2'", (files_done,)) + conn.commit() + conn.close() + + elapsed = time.time() - start_time + eta = estimate_time_remaining(files_done, total_files, elapsed) + click.echo(f"Info: Merged MS2 features for {files_done}/{total_files} files" + (f" (ETA: {eta})" if eta else "")) + else: + click.echo("Skipping MS2 features (already merged)") + + # ===== MERGE TRANSITION FEATURES (batch processed, resume from last completed file) ===== + trans_start_file = progress["FEATURE_TRANSITION"] + if trans_start_file < total_files: + click.echo(f"\nMerging transition features (resuming from file {trans_start_file}/{total_files})...") + for batch_num in range(trans_start_file, total_files, batch_size): + batch_files = infiles[batch_num:min(batch_num + batch_size, total_files)] + conn = sqlite3.connect(outfile) + c = conn.cursor() + + for idx, infile in enumerate(batch_files): + db_alias = f"trans_{batch_num}_{idx}" + c.executescript( + f""" + ATTACH DATABASE "{infile}" AS {db_alias}; + INSERT INTO FEATURE_TRANSITION SELECT * FROM {db_alias}.FEATURE_TRANSITION; + DETACH DATABASE {db_alias}; """ - ) - - conn.commit() - conn.close() + ) + + conn.commit() + conn.close() + + files_done = min(batch_num + batch_size, total_files) + + # Update progress tracking + conn = sqlite3.connect(outfile) + conn.execute("UPDATE MERGE_PROGRESS SET files_completed = ? 
WHERE table_name = 'FEATURE_TRANSITION'", (files_done,)) + conn.commit() + conn.close() + + elapsed = time.time() - start_time + eta = estimate_time_remaining(files_done, total_files, elapsed) + click.echo(f"Info: Merged transition features for {files_done}/{total_files} files" + (f" (ETA: {eta})" if eta else "")) + else: + click.echo("Skipping transition features (already merged)") - click.echo(f"Info: Merged transition features of file {infile} to {outfile}.") + click.echo("\nInfo: All pre-scored OSWS files were merged successfully.") + total_time = time.time() - start_time + click.echo(f"Total merge time: {timedelta(seconds=int(total_time))}") - click.echo("Info: All OSWS files were merged.") -def merge_oswr(infiles, outfile, templatefile, same_run): +def merge_oswr(infiles, outfile, templatefile, same_run, fresh=False): + import time + from datetime import timedelta + # Copy the template to the output file + if fresh and os.path.exists(outfile): + click.echo(f"Info: --fresh flag set. 
Removing existing {outfile} and starting from scratch...") + os.remove(outfile) copyfile(templatefile, outfile) conn = sqlite3.connect(outfile) c = conn.cursor() @@ -519,34 +731,24 @@ def merge_oswr(infiles, outfile, templatefile, same_run): c.executescript( """ PRAGMA synchronous = OFF; +PRAGMA cache_size = 50000; +PRAGMA temp_store = MEMORY; DROP TABLE IF EXISTS RUN; - DROP TABLE IF EXISTS FEATURE; - DROP TABLE IF EXISTS FEATURE_MS1; - DROP TABLE IF EXISTS FEATURE_MS2; - DROP TABLE IF EXISTS FEATURE_TRANSITION; - DROP TABLE IF EXISTS SCORE_MS1; - DROP TABLE IF EXISTS SCORE_MS2; - DROP TABLE IF EXISTS SCORE_TRANSITION; - DROP TABLE IF EXISTS SCORE_PEPTIDE; - DROP TABLE IF EXISTS SCORE_PROTEIN; - DROP TABLE IF EXISTS SCORE_IPF; CREATE TABLE RUN(ID INT PRIMARY KEY NOT NULL, FILENAME TEXT NOT NULL); - CREATE TABLE SCORE_MS2(FEATURE_ID INTEGER, SCORE REAL); - CREATE TABLE FEATURE(ID INT PRIMARY KEY NOT NULL, RUN_ID INT NOT NULL, PRECURSOR_ID INT NOT NULL); @@ -556,68 +758,107 @@ def merge_oswr(infiles, outfile, templatefile, same_run): conn.commit() conn.close() - for infile in infiles: + # ===== BATCH PROCESSING OPTIMIZATION ===== + batch_size = 50 + total_files = len(infiles) + start_time = time.time() + + def estimate_time_remaining(files_done, total_files, elapsed_seconds): + if files_done == 0: + return None + rate = elapsed_seconds / files_done + remaining_files = total_files - files_done + remaining_seconds = rate * remaining_files + return timedelta(seconds=int(remaining_seconds)) + + # ===== MERGE RUN DATA (batch processed) ===== + if not same_run: + click.echo("Merging RUN data...") + for batch_num in range(0, total_files, batch_size): + batch_files = infiles[batch_num:min(batch_num + batch_size, total_files)] + conn = sqlite3.connect(outfile) + c = conn.cursor() + + for idx, infile in enumerate(batch_files): + db_alias = f"run_{batch_num}_{idx}" + c.executescript(f'ATTACH DATABASE "{infile}" AS {db_alias}; INSERT INTO RUN SELECT * FROM {db_alias}.RUN; 
DETACH DATABASE {db_alias};') + + conn.commit() + conn.close() + + files_done = min(batch_num + batch_size, total_files) + elapsed = time.time() - start_time + eta = estimate_time_remaining(files_done, total_files, elapsed) + click.echo(f"Info: Merged RUN data for {files_done}/{total_files} files" + (f" (ETA: {eta})" if eta else "")) + else: conn = sqlite3.connect(outfile) c = conn.cursor() - - # Only create a single run entry (all files are presumably from the same run) - if same_run: - c.executescript( - f"""INSERT INTO RUN (ID, FILENAME) VALUES ({runid}, '{rname}')""" - ) - break - else: - c.executescript( - f'ATTACH DATABASE "{infile}" AS sdb; INSERT INTO RUN SELECT * FROM sdb.RUN; DETACH DATABASE sdb;' - ) - + c.executescript(f"""INSERT INTO RUN (ID, FILENAME) VALUES ({runid}, '{rname}')""") conn.commit() conn.close() - click.echo(f"Info: Merged runs of file {infile} to {outfile}.") - - for infile in infiles: + # ===== MERGE FEATURE DATA (batch processed) ===== + click.echo("\nMerging FEATURE data...") + for batch_num in range(0, total_files, batch_size): + batch_files = infiles[batch_num:min(batch_num + batch_size, total_files)] conn = sqlite3.connect(outfile) c = conn.cursor() - - c.executescript( - f'ATTACH DATABASE "{infile}" AS sdb; INSERT INTO FEATURE SELECT * FROM sdb.FEATURE; DETACH DATABASE sdb;' - ) - + + for idx, infile in enumerate(batch_files): + db_alias = f"feat_{batch_num}_{idx}" + c.executescript(f'ATTACH DATABASE "{infile}" AS {db_alias}; INSERT INTO FEATURE SELECT * FROM {db_alias}.FEATURE; DETACH DATABASE {db_alias};') + conn.commit() conn.close() - - click.echo(f"Info: Merged generic features of file {infile} to {outfile}.") + + files_done = min(batch_num + batch_size, total_files) + elapsed = time.time() - start_time + eta = estimate_time_remaining(files_done, total_files, elapsed) + click.echo(f"Info: Merged FEATURE data for {files_done}/{total_files} files" + (f" (ETA: {eta})" if eta else "")) if same_run: conn = 
sqlite3.connect(outfile) c = conn.cursor() - - # Fix run id assuming we only have a single run c.executescript(f"""UPDATE FEATURE SET RUN_ID = {runid}""") - conn.commit() conn.close() - for infile in infiles: + # ===== MERGE SCORE_MS2 DATA (batch processed) ===== + click.echo("\nMerging SCORE_MS2 data...") + for batch_num in range(0, total_files, batch_size): + batch_files = infiles[batch_num:min(batch_num + batch_size, total_files)] conn = sqlite3.connect(outfile) c = conn.cursor() - - c.executescript( - f'ATTACH DATABASE "{infile}" AS sdb; INSERT INTO SCORE_MS2 SELECT * FROM sdb.SCORE_MS2; DETACH DATABASE sdb;' - ) - + + for idx, infile in enumerate(batch_files): + db_alias = f"score_{batch_num}_{idx}" + c.executescript(f'ATTACH DATABASE "{infile}" AS {db_alias}; INSERT INTO SCORE_MS2 SELECT * FROM {db_alias}.SCORE_MS2; DETACH DATABASE {db_alias};') + conn.commit() conn.close() + + files_done = min(batch_num + batch_size, total_files) + elapsed = time.time() - start_time + eta = estimate_time_remaining(files_done, total_files, elapsed) + click.echo(f"Info: Merged SCORE_MS2 data for {files_done}/{total_files} files" + (f" (ETA: {eta})" if eta else "")) - click.echo(f"Info: Merged MS2 scores of file {infile} to {outfile}.") + click.echo("\nInfo: All reduced OSWR files were merged successfully.") + total_time = time.time() - start_time + click.echo(f"Total merge time: {timedelta(seconds=int(total_time))}") - click.echo("Info: All reduced OSWR files were merged.") -def merge_oswps(infiles, outfile, templatefile, same_run): +def merge_oswps(infiles, outfile, templatefile, same_run, fresh=False): + import time + from datetime import timedelta + click.echo("Info: Merging all Scored Runs.") + click.echo(f"Info: Processing {len(infiles)} OSW files (total ~{len(infiles) * 14}GB)") + # Copy the first file to have a template + if fresh and os.path.exists(outfile): + click.echo(f"Info: --fresh flag set. 
Removing existing {outfile} and starting from scratch...") + os.remove(outfile) copyfile(templatefile, outfile) conn = sqlite3.connect(outfile) c = conn.cursor() @@ -673,6 +914,8 @@ def merge_oswps(infiles, outfile, templatefile, same_run): c.executescript( f""" PRAGMA synchronous = OFF; + PRAGMA cache_size = 50000; + PRAGMA temp_store = MEMORY; DROP TABLE IF EXISTS RUN; DROP TABLE IF EXISTS FEATURE; DROP TABLE IF EXISTS FEATURE_MS1; @@ -702,244 +945,232 @@ def merge_oswps(infiles, outfile, templatefile, same_run): conn.commit() conn.close() - for infile in infiles: + # ===== BATCH PROCESSING OPTIMIZATION ===== + # Instead of opening/closing connection 248 times per table, + # batch process files in groups to reduce I/O overhead + batch_size = 50 # Process 50 files per batch + total_files = len(infiles) + total_batches = (total_files + batch_size - 1) // batch_size + start_time = time.time() + + def estimate_time_remaining(files_done, total_files, elapsed_seconds): + """Calculate estimated time remaining""" + if files_done == 0: + return None + rate = elapsed_seconds / files_done + remaining_files = total_files - files_done + remaining_seconds = rate * remaining_files + return timedelta(seconds=int(remaining_seconds)) + + # ===== MERGE RUN DATA (batch processed) ===== + if not same_run: + click.echo("Merging RUN data...") + for batch_num in range(0, total_files, batch_size): + batch_files = infiles[batch_num:min(batch_num + batch_size, total_files)] + conn = sqlite3.connect(outfile) + c = conn.cursor() + + for infile in batch_files: + c.executescript( + f""" + ATTACH DATABASE "{infile}" AS sdb_{batch_num}; + INSERT INTO RUN SELECT * FROM sdb_{batch_num}.RUN; + DETACH DATABASE sdb_{batch_num}; + """ + ) + + conn.commit() + conn.close() + + files_done = min(batch_num + batch_size, total_files) + elapsed = time.time() - start_time + eta = estimate_time_remaining(files_done, total_files, elapsed) + click.echo(f"Info: Merged RUN data for {files_done}/{total_files} 
files" + (f" (ETA: {eta})" if eta else "")) + else: conn = sqlite3.connect(outfile) c = conn.cursor() - - # Only create a single run entry (all files are presumably from the same run) - if same_run: - c.executescript( - f"""INSERT INTO RUN (ID, FILENAME) VALUES ({runid}, '{rname}')""" - ) - break - else: - c.executescript( - f""" - ATTACH DATABASE "{infile}" AS sdb; - INSERT INTO RUN SELECT * FROM sdb.RUN; - DETACH DATABASE sdb; - """ - ) - + c.executescript( + f"""INSERT INTO RUN (ID, FILENAME) VALUES ({runid}, '{rname}')""" + ) conn.commit() conn.close() - click.echo(f"Info: Merged runs of file {infile} to {outfile}.") - - # Now merge the run-specific data into the output file: - # Note: only tables FEATURE, FEATURE_MS1, FEATURE_MS2 and FEATURE_TRANSITION are run-specific - for infile in infiles: + # ===== MERGE FEATURE DATA (batch processed) ===== + click.echo("\nMerging FEATURE data...") + for batch_num in range(0, total_files, batch_size): + batch_files = infiles[batch_num:min(batch_num + batch_size, total_files)] conn = sqlite3.connect(outfile) c = conn.cursor() - - c.executescript( - f""" - ATTACH DATABASE "{infile}" AS sdb; - INSERT INTO FEATURE SELECT * FROM sdb.FEATURE; - DETACH DATABASE sdb; + + for idx, infile in enumerate(batch_files): + db_alias = f"feat_{batch_num}_{idx}" + c.executescript( + f""" + ATTACH DATABASE "{infile}" AS {db_alias}; + INSERT INTO FEATURE SELECT * FROM {db_alias}.FEATURE; + DETACH DATABASE {db_alias}; """ - ) - + ) + conn.commit() conn.close() - - click.echo(f"Info: Merged generic features of file {infile} to {outfile}.") + + files_done = min(batch_num + batch_size, total_files) + elapsed = time.time() - start_time + eta = estimate_time_remaining(files_done, total_files, elapsed) + click.echo(f"Info: Merged FEATURE data for {files_done}/{total_files} files" + (f" (ETA: {eta})" if eta else "")) if same_run: conn = sqlite3.connect(outfile) c = conn.cursor() - - # Fix run id assuming we only have a single run 
c.executescript(f"UPDATE FEATURE SET RUN_ID = {runid}") - conn.commit() conn.close() - for infile in infiles: + # ===== MERGE MS1 FEATURES (batch processed) - CRITICAL BOTTLENECK ===== + click.echo("\nMerging MS1 features (this may take a while)...") + for batch_num in range(0, total_files, batch_size): + batch_files = infiles[batch_num:min(batch_num + batch_size, total_files)] conn = sqlite3.connect(outfile) c = conn.cursor() - - c.executescript( - f""" - ATTACH DATABASE "{infile}" AS sdb; - INSERT INTO FEATURE_MS1 - SELECT * - FROM sdb.FEATURE_MS1; - DETACH DATABASE sdb; + + for idx, infile in enumerate(batch_files): + db_alias = f"ms1_{batch_num}_{idx}" + c.executescript( + f""" + ATTACH DATABASE "{infile}" AS {db_alias}; + INSERT INTO FEATURE_MS1 SELECT * FROM {db_alias}.FEATURE_MS1; + DETACH DATABASE {db_alias}; """ - ) - + ) + conn.commit() conn.close() - - click.echo(f"Info: Merged MS1 features of file {infile} to {outfile}.") - - for infile in infiles: + + files_done = min(batch_num + batch_size, total_files) + elapsed = time.time() - start_time + eta = estimate_time_remaining(files_done, total_files, elapsed) + click.echo(f"Info: Merged MS1 features for {files_done}/{total_files} files" + (f" (ETA: {eta})" if eta else "")) + + # ===== MERGE MS2 FEATURES (batch processed) ===== + click.echo("\nMerging MS2 features...") + for batch_num in range(0, total_files, batch_size): + batch_files = infiles[batch_num:min(batch_num + batch_size, total_files)] conn = sqlite3.connect(outfile) c = conn.cursor() - - c.executescript( - f""" - ATTACH DATABASE "{infile}" AS sdb; - INSERT INTO FEATURE_MS2 - SELECT * - FROM sdb.FEATURE_MS2; - DETACH DATABASE sdb; + + for idx, infile in enumerate(batch_files): + db_alias = f"ms2_{batch_num}_{idx}" + c.executescript( + f""" + ATTACH DATABASE "{infile}" AS {db_alias}; + INSERT INTO FEATURE_MS2 SELECT * FROM {db_alias}.FEATURE_MS2; + DETACH DATABASE {db_alias}; """ - ) - + ) + conn.commit() conn.close() - - click.echo(f"Info: Merged 
MS2 features of file {infile} to {outfile}.") - - for infile in infiles: + + files_done = min(batch_num + batch_size, total_files) + elapsed = time.time() - start_time + eta = estimate_time_remaining(files_done, total_files, elapsed) + click.echo(f"Info: Merged MS2 features for {files_done}/{total_files} files" + (f" (ETA: {eta})" if eta else "")) + + # ===== MERGE TRANSITION FEATURES (batch processed) ===== + click.echo("\nMerging transition features...") + for batch_num in range(0, total_files, batch_size): + batch_files = infiles[batch_num:min(batch_num + batch_size, total_files)] conn = sqlite3.connect(outfile) c = conn.cursor() - - c.executescript( - f""" - ATTACH DATABASE "{infile}" AS sdb; - INSERT INTO FEATURE_TRANSITION - SELECT * - FROM sdb.FEATURE_TRANSITION; - DETACH DATABASE sdb; + + for idx, infile in enumerate(batch_files): + db_alias = f"trans_{batch_num}_{idx}" + c.executescript( + f""" + ATTACH DATABASE "{infile}" AS {db_alias}; + INSERT INTO FEATURE_TRANSITION SELECT * FROM {db_alias}.FEATURE_TRANSITION; + DETACH DATABASE {db_alias}; """ - ) - + ) + conn.commit() conn.close() + + files_done = min(batch_num + batch_size, total_files) + elapsed = time.time() - start_time + eta = estimate_time_remaining(files_done, total_files, elapsed) + click.echo(f"Info: Merged transition features for {files_done}/{total_files} files" + (f" (ETA: {eta})" if eta else "")) - click.echo(f"Info: Merged transition features of file {infile} to {outfile}.") - + # ===== MERGE ALIGNMENT FEATURES (batch processed) ===== if feature_alignment_tables_present: - for infile in infiles: - # Check if the infile contains the feature_alignment table - conn = sqlite3.connect(infile) - feature_alignment_present = check_sqlite_table(conn, "FEATURE_ALIGNMENT") - conn.close() - - if feature_alignment_present: + for alignment_table in feature_alignment_tables: + click.echo(f"\nMerging {alignment_table} data...") + for batch_num in range(0, total_files, batch_size): + batch_files = 
infiles[batch_num:min(batch_num + batch_size, total_files)] conn = sqlite3.connect(outfile) - c = conn.cursor() - - c.executescript( - f""" - ATTACH DATABASE "{infile}" AS sdb; - INSERT INTO FEATURE_ALIGNMENT - SELECT * - FROM sdb.FEATURE_ALIGNMENT; - DETACH DATABASE sdb; + + # Pre-check which files have this table + files_with_table = [] + for infile in batch_files: + check_conn = sqlite3.connect(infile) + if check_sqlite_table(check_conn, alignment_table): + files_with_table.append(infile) + check_conn.close() + + if files_with_table: + c = conn.cursor() + for idx, infile in enumerate(files_with_table): + db_alias = f"align_{batch_num}_{idx}" + c.executescript( + f""" + ATTACH DATABASE "{infile}" AS {db_alias}; + INSERT INTO {alignment_table} SELECT * FROM {db_alias}.{alignment_table}; + DETACH DATABASE {db_alias}; """ - ) - - conn.commit() + ) + + conn.commit() + conn.close() - - click.echo( - f"Info: Merged feature alignment tables of file {infile} to {outfile}." - ) - else: - click.echo(f"Warn: No feature alignment table found in file {infile}.") - - # Merge FEATURE_MS2_ALIGNMENT - for infile in infiles: - conn = sqlite3.connect(infile) - feature_ms2_alignment_present = check_sqlite_table( - conn, "FEATURE_MS2_ALIGNMENT" - ) - conn.close() - - if feature_ms2_alignment_present: - conn = sqlite3.connect(outfile) - c = conn.cursor() - - c.executescript( - f""" - ATTACH DATABASE "{infile}" AS sdb; - INSERT INTO FEATURE_MS2_ALIGNMENT - SELECT * - FROM sdb.FEATURE_MS2_ALIGNMENT; - DETACH DATABASE sdb; - """ - ) - - conn.commit() - conn.close() - - click.echo( - f"Info: Merged feature MS2 alignment tables of file {infile} to {outfile}." - ) - else: - click.echo( - f"Warn: No feature MS2 alignment table found in file {infile}." 
- ) - - # Merge FEATURE_TRANSITION_ALIGNMENT - for infile in infiles: - conn = sqlite3.connect(infile) - feature_transition_alignment_present = check_sqlite_table( - conn, "FEATURE_TRANSITION_ALIGNMENT" - ) - conn.close() - - if feature_transition_alignment_present: - conn = sqlite3.connect(outfile) - c = conn.cursor() - - c.executescript( - f""" - ATTACH DATABASE "{infile}" AS sdb; - INSERT INTO FEATURE_TRANSITION_ALIGNMENT - SELECT * - FROM sdb.FEATURE_TRANSITION_ALIGNMENT; - DETACH DATABASE sdb; - """ - ) - - conn.commit() - conn.close() - - click.echo( - f"Info: Merged feature transition alignment tables of file {infile} to {outfile}." - ) - else: - click.echo( - f"Warn: No feature transition alignment table found in file {infile}." - ) - - for infile in infiles: - for score_tbl in score_tables: + + files_done = min(batch_num + batch_size, total_files) + elapsed = time.time() - start_time + eta = estimate_time_remaining(files_done, total_files, elapsed) + if files_with_table: + click.echo(f"Info: Merged {alignment_table} for {files_done}/{total_files} files" + (f" (ETA: {eta})" if eta else "")) + + # ===== MERGE SCORE TABLES (batch processed) ===== + for score_tbl in score_tables: + click.echo(f"\nMerging {score_tbl} data...") + for batch_num in range(0, total_files, batch_size): + batch_files = infiles[batch_num:min(batch_num + batch_size, total_files)] conn = sqlite3.connect(outfile) c = conn.cursor() - - c.executescript( - f""" - ATTACH DATABASE "{infile}" AS sdb; - INSERT INTO {score_tbl} - SELECT * - FROM sdb.{score_tbl}; - DETACH DATABASE sdb; + + for idx, infile in enumerate(batch_files): + db_alias = f"score_{batch_num}_{idx}" + c.executescript( + f""" + ATTACH DATABASE "{infile}" AS {db_alias}; + INSERT INTO {score_tbl} SELECT * FROM {db_alias}.{score_tbl}; + DETACH DATABASE {db_alias}; """ - ) - + ) + conn.commit() conn.close() + + files_done = min(batch_num + batch_size, total_files) + elapsed = time.time() - start_time + eta = 
estimate_time_remaining(files_done, total_files, elapsed) + click.echo(f"Info: Merged {score_tbl} for {files_done}/{total_files} files" + (f" (ETA: {eta})" if eta else "")) - click.echo(f"Info: Merged {score_tbl} table of file {infile} to {outfile}.") - - ## Vacuum to clean and re-write rootpage indexes - conn = sqlite3.connect(outfile) - c = conn.cursor() - - c.executescript("VACUUM") - - conn.commit() - conn.close() - - click.echo(f"Info: Cleaned and re-wrote indexing meta-data for {outfile}.") + ## VACUUM is intentionally skipped here because it is slow on large merged files. NOTE: SQLite does not reclaim the pages freed by the DROP TABLE statements automatically; run VACUUM manually on the output to compact it. + click.echo("\nInfo: All Post-Scored OSWS files were merged successfully.") + total_time = time.time() - start_time + click.echo(f"Total merge time: {timedelta(seconds=int(total_time))}") - click.echo("Info: All Post-Scored OSWS files were merged.") def backpropagate_oswr(infile, outfile, apply_scores): From 0ff6b3c0e02e75b052f828bd9be01069c5bad45f Mon Sep 17 00:00:00 2001 From: singjc Date: Thu, 18 Jun 2026 13:14:18 -0400 Subject: [PATCH 2/2] Refactor parquet export tests to normalize output format - Updated the test output for parquet export with IPF scores to use a normalized frame for consistency. - Changed the way sample data is printed in the tests to ensure uniform formatting across different test cases. - Ensured that the expected SCORE_IPF columns are validated and printed correctly in the test outputs. 
--- ...test_parquet_export_no_transition_data.out | 22 +++++++++---------- ..._export.test_parquet_export_scored_osw.out | 22 +++++++++---------- ...et_export.test_parquet_export_with_ipf.out | 20 ++++++++--------- tests/test_pyprophet_export.py | 12 +++++++--- 4 files changed, 41 insertions(+), 35 deletions(-) diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_parquet_export_no_transition_data.out b/tests/_regtest_outputs/test_pyprophet_export.test_parquet_export_no_transition_data.out index 8a62a802..b90e62c1 100644 --- a/tests/_regtest_outputs/test_pyprophet_export.test_parquet_export_no_transition_data.out +++ b/tests/_regtest_outputs/test_pyprophet_export.test_parquet_export_no_transition_data.out @@ -1,13 +1,13 @@ Exported 3410 rows with 100 columns (no transition data) Score columns found: ['SCORE_MS2_PEAK_GROUP_RANK', 'SCORE_MS2_PEP', 'SCORE_MS2_P_VALUE', 'SCORE_MS2_Q_VALUE', 'SCORE_MS2_SCORE', 'SCORE_PEPTIDE_GLOBAL_PEP', 'SCORE_PEPTIDE_GLOBAL_P_VALUE', 'SCORE_PEPTIDE_GLOBAL_Q_VALUE', 'SCORE_PEPTIDE_GLOBAL_SCORE', 'SCORE_PROTEIN_GLOBAL_PEP', 'SCORE_PROTEIN_GLOBAL_P_VALUE', 'SCORE_PROTEIN_GLOBAL_Q_VALUE', 'SCORE_PROTEIN_GLOBAL_SCORE'] - ANNOTATION DELTA_RT EXP_IM EXP_RT FEATURE_ID FEATURE_MS1_APEX_INTENSITY FEATURE_MS1_AREA_INTENSITY FEATURE_MS1_VAR_ISOTOPE_CORRELATION_SCORE FEATURE_MS1_VAR_ISOTOPE_OVERLAP_SCORE FEATURE_MS1_VAR_MASSDEV_SCORE FEATURE_MS1_VAR_XCORR_COELUTION FEATURE_MS1_VAR_XCORR_SHAPE FEATURE_MS2_APEX_INTENSITY FEATURE_MS2_AREA_INTENSITY FEATURE_MS2_VAR_BSERIES_SCORE FEATURE_MS2_VAR_DOTPROD_SCORE FEATURE_MS2_VAR_ELUTION_MODEL_FIT_SCORE FEATURE_MS2_VAR_INTENSITY_SCORE FEATURE_MS2_VAR_ISOTOPE_CORRELATION_SCORE FEATURE_MS2_VAR_ISOTOPE_OVERLAP_SCORE FEATURE_MS2_VAR_LIBRARY_CORR FEATURE_MS2_VAR_LIBRARY_DOTPROD FEATURE_MS2_VAR_LIBRARY_MANHATTAN FEATURE_MS2_VAR_LIBRARY_RMSD FEATURE_MS2_VAR_LIBRARY_ROOTMEANSQUARE FEATURE_MS2_VAR_LIBRARY_SANGLE FEATURE_MS2_VAR_LOG_SN_SCORE FEATURE_MS2_VAR_MANHATTAN_SCORE FEATURE_MS2_VAR_MASSDEV_SCORE 
FEATURE_MS2_VAR_MASSDEV_SCORE_WEIGHTED FEATURE_MS2_VAR_NORM_RT_SCORE FEATURE_MS2_VAR_SONAR_LAG FEATURE_MS2_VAR_SONAR_LOG_DIFF FEATURE_MS2_VAR_SONAR_LOG_SN FEATURE_MS2_VAR_SONAR_LOG_TREND FEATURE_MS2_VAR_SONAR_RSQ FEATURE_MS2_VAR_SONAR_SHAPE FEATURE_MS2_VAR_XCORR_COELUTION FEATURE_MS2_VAR_XCORR_COELUTION_WEIGHTED FEATURE_MS2_VAR_XCORR_SHAPE FEATURE_MS2_VAR_XCORR_SHAPE_WEIGHTED FEATURE_MS2_VAR_YSERIES_SCORE FEATURE_TRANSITION_APEX_INTENSITY FEATURE_TRANSITION_AREA_INTENSITY FEATURE_TRANSITION_VAR_ISOTOPE_CORRELATION_SCORE FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE FEATURE_TRANSITION_VAR_LOG_INTENSITY FEATURE_TRANSITION_VAR_LOG_SN_SCORE FEATURE_TRANSITION_VAR_MASSDEV_SCORE FEATURE_TRANSITION_VAR_XCORR_COELUTION FEATURE_TRANSITION_VAR_XCORR_SHAPE FILENAME GENE_DECOY GENE_ID GENE_NAME IM_leftWidth IM_rightWidth IPF_PEPTIDE_ID LEFT_WIDTH MODIFIED_SEQUENCE NORM_RT PEPTIDE_DECOY PEPTIDE_ID PRECURSOR_CHARGE PRECURSOR_DECOY PRECURSOR_GROUP_LABEL PRECURSOR_ID PRECURSOR_LIBRARY_DRIFT_TIME PRECURSOR_LIBRARY_INTENSITY PRECURSOR_LIBRARY_RT PRECURSOR_MZ PRECURSOR_TRAML_ID PRODUCT_MZ PROTEIN_ACCESSION PROTEIN_DECOY PROTEIN_ID RIGHT_WIDTH RUN_ID SCORE_MS2_PEAK_GROUP_RANK SCORE_MS2_PEP SCORE_MS2_P_VALUE SCORE_MS2_Q_VALUE SCORE_MS2_SCORE SCORE_PEPTIDE_GLOBAL_PEP SCORE_PEPTIDE_GLOBAL_P_VALUE SCORE_PEPTIDE_GLOBAL_Q_VALUE SCORE_PEPTIDE_GLOBAL_SCORE SCORE_PROTEIN_GLOBAL_PEP SCORE_PROTEIN_GLOBAL_P_VALUE SCORE_PROTEIN_GLOBAL_Q_VALUE SCORE_PROTEIN_GLOBAL_SCORE TRANSITION_CHARGE TRANSITION_DECOY TRANSITION_DETECTING TRANSITION_ID TRANSITION_LIBRARY_INTENSITY TRANSITION_ORDINAL TRANSITION_TRAML_ID TRANSITION_TYPE UNMODIFIED_SEQUENCE -0 None 65.9712 NaN 2661.55 -4409520928686189639 117220.7500 8.5464e+05 0.9835 0.1247 1.3707 0.0000 0.9907 30361.0 207283.0 9.0 0.7708 NaN 0.7811 0.9962 0.0000 0.9987 0.9978 0.0659 0.0239 0.0262 0.0725 4.7388 0.7451 0.3398 0.1793 0.0194 NaN NaN NaN NaN NaN NaN 0.0000 0.0000 0.9936 0.9958 11.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz 
None NaN None NaN NaN 32 2640.5100 ADSTGTLVITDPTR(UniMod:267) 28.4379 False 33 2 False NA 0 NaN NaN 26.5 728.8795 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... NaN AQUA4SWATH_HMLangeA None 0 2705.3701 -8670811102654834151 1 0.0031 0.0029 0.0033 5.7301 0.0031 0.0029 0.0033 5.7301 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ADSTGTLVITDPTR -1 None 10.1667 NaN 2605.74 260819276075322832 8790.7812 1.0401e+05 0.9555 0.2667 5.4202 5.1430 0.6532 990.0 6385.0 2.0 0.7610 NaN 0.0241 0.9216 0.1104 0.8271 0.9764 0.2223 0.0995 0.1102 0.3579 1.3130 0.7675 4.5391 3.5103 0.0032 NaN NaN NaN NaN NaN NaN 7.0474 2.3104 0.7806 0.8341 6.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 32 2575.6399 ADSTGTLVITDPTR(UniMod:267) 26.8198 False 33 2 False NA 0 NaN NaN 26.5 728.8795 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... NaN AQUA4SWATH_HMLangeA None 0 2623.4399 -8670811102654834151 2 1.0000 0.0674 0.0685 1.2404 0.0031 0.0029 0.0033 5.7301 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ADSTGTLVITDPTR -2 None 237.1922 NaN 2832.77 5163914660633416481 10419.7432 2.4187e+05 0.6123 0.4707 8.9907 4.0083 0.5985 546.0 5180.0 2.0 0.7923 NaN 0.0195 0.8418 0.0911 0.9916 0.9960 0.0958 0.0387 0.0426 0.1243 0.6699 0.6863 4.7328 2.9948 0.0690 NaN NaN NaN NaN NaN NaN 4.3568 2.0950 0.6909 0.6974 6.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 32 2811.2000 ADSTGTLVITDPTR(UniMod:267) 33.4026 False 33 2 False NA 0 NaN NaN 26.5 728.8795 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... 
NaN AQUA4SWATH_HMLangeA None 0 2855.5801 -8670811102654834151 5 1.0000 0.4692 0.4692 -0.1013 0.0031 0.0029 0.0033 5.7301 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ADSTGTLVITDPTR -3 None 199.4846 NaN 2795.06 6932937885234622359 4036.5601 2.5862e+04 0.1872 2.4435 1.8505 4.0083 0.6422 497.0 2693.0 4.0 0.7883 NaN 0.0101 0.6804 0.1794 0.4554 0.9481 0.3084 0.1494 0.1882 0.6202 0.6284 0.6986 5.4811 3.8885 0.0581 NaN NaN NaN NaN NaN NaN 1.6487 0.9186 0.7955 0.7971 6.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 32 2790.7200 ADSTGTLVITDPTR(UniMod:267) 32.3092 False 33 2 False NA 0 NaN NaN 26.5 728.8795 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... NaN AQUA4SWATH_HMLangeA None 0 2811.2000 -8670811102654834151 4 1.0000 0.1994 0.2018 0.6777 0.0031 0.0029 0.0033 5.7301 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ADSTGTLVITDPTR -4 None 112.9550 NaN 2708.53 8534214264242363560 5750.4717 7.3215e+04 -0.3692 0.7498 7.1610 6.7500 0.4827 539.0 3838.0 3.0 0.8181 NaN 0.0145 0.7660 0.1334 0.8344 0.9736 0.2367 0.1055 0.1166 0.3772 0.6034 0.6468 2.5636 1.1471 0.0330 NaN NaN NaN NaN NaN NaN 3.4656 0.9347 0.6790 0.7379 5.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 32 2705.3701 ADSTGTLVITDPTR(UniMod:267) 29.8002 False 33 2 False NA 0 NaN NaN 26.5 728.8795 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... 
NaN AQUA4SWATH_HMLangeA None 0 2736.0901 -8670811102654834151 3 1.0000 0.1994 0.2018 0.8151 0.0031 0.0029 0.0033 5.7301 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ADSTGTLVITDPTR -5 None -33.8921 NaN 2915.46 -7771919224870429764 184097.5000 1.3502e+06 0.9791 0.1077 0.4784 0.0000 0.9955 48041.0 356015.0 11.0 0.7672 NaN 0.7753 0.9926 0.0000 0.8733 0.9890 0.1414 0.0968 0.1030 0.2900 4.6343 0.7586 1.1505 1.3231 0.0100 NaN NaN NaN NaN NaN NaN 0.0000 0.0000 0.9987 0.9985 9.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 79 2892.9299 ALGYEDATQALGR(UniMod:267) 35.8003 False 80 2 False AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.8480 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... NaN AQUA4SWATH_HMLangeA None 0 2947.5500 -8670811102654834151 1 0.0031 0.0029 0.0033 5.6547 0.0031 0.0029 0.0033 5.6547 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ALGYEDATQALGR -6 None 116.0990 NaN 3065.45 -1618439206596772741 2301.7830 1.0445e+05 -0.2527 3.0023 27.2973 6.2440 0.6421 1677.0 14023.0 8.0 0.7684 NaN 0.0305 0.4049 0.0000 0.9857 0.9992 0.0354 0.0206 0.0227 0.0631 1.2430 0.7016 4.4870 5.4277 0.0335 NaN NaN NaN NaN NaN NaN 0.5749 0.1143 0.7704 0.7332 5.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 79 3043.1399 ALGYEDATQALGR(UniMod:267) 40.1495 False 80 2 False AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.8480 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... 
NaN AQUA4SWATH_HMLangeA None 0 3087.5200 -8670811102654834151 2 1.0000 0.0674 0.0685 1.5182 0.0031 0.0029 0.0033 5.6547 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ALGYEDATQALGR -7 None 159.3408 NaN 3108.69 -1528523763856537095 3486.5154 5.5320e+04 -0.4336 3.6979 7.6742 6.7217 0.6224 908.0 7875.0 4.0 0.5768 NaN 0.0171 0.6492 0.3228 0.7616 0.9878 0.1379 0.0796 0.0899 0.2518 0.7011 0.9173 6.5049 6.5553 0.0460 NaN NaN NaN NaN NaN NaN 5.2085 3.1141 0.8087 0.7578 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 79 3087.5200 ALGYEDATQALGR(UniMod:267) 41.4033 False 80 2 False AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.8480 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... NaN AQUA4SWATH_HMLangeA None 0 3125.0701 -8670811102654834151 5 1.0000 0.4692 0.4692 -1.2624 0.0031 0.0029 0.0033 5.6547 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ALGYEDATQALGR -8 None -63.4234 NaN 2885.93 4932359594612728841 8025.5601 7.0488e+04 0.5051 1.5720 3.6487 4.9107 0.4855 1921.0 6866.0 5.0 0.3595 NaN 0.0150 0.4893 0.8436 -0.6868 0.8659 0.4973 0.3252 0.3576 0.9445 1.5314 1.1339 4.0154 5.9697 0.0186 NaN NaN NaN NaN NaN NaN 0.0000 0.0000 0.8911 0.8734 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 79 2869.0300 ALGYEDATQALGR(UniMod:267) 34.9440 False 80 2 False AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.8480 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... 
NaN AQUA4SWATH_HMLangeA None 0 2889.5100 -8670811102654834151 4 1.0000 0.4692 0.4692 -1.1134 0.0031 0.0029 0.0033 5.6547 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ALGYEDATQALGR -9 None 288.8942 NaN 3238.25 9173417844487548347 3238.0957 6.5173e+04 -0.7411 2.1667 10.3701 9.0000 0.5089 1051.0 9306.0 6.0 0.7827 NaN 0.0203 0.5781 0.3886 0.8789 0.9890 0.1243 0.0848 0.0992 0.2789 0.8313 0.6932 2.6921 1.8259 0.0836 NaN NaN NaN NaN NaN NaN 4.2487 2.0195 0.7744 0.7170 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 79 3207.0100 ALGYEDATQALGR(UniMod:267) 45.1599 False 80 2 False AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.8480 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... NaN AQUA4SWATH_HMLangeA None 0 3247.9700 -8670811102654834151 3 1.0000 0.4692 0.4692 -0.5351 0.0031 0.0029 0.0033 5.6547 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ALGYEDATQALGR + ANNOTATION DELTA_RT EXP_IM EXP_RT FEATURE_ID FEATURE_MS1_APEX_INTENSITY FEATURE_MS1_AREA_INTENSITY FEATURE_MS1_VAR_ISOTOPE_CORRELATION_SCORE FEATURE_MS1_VAR_ISOTOPE_OVERLAP_SCORE FEATURE_MS1_VAR_MASSDEV_SCORE FEATURE_MS1_VAR_XCORR_COELUTION FEATURE_MS1_VAR_XCORR_SHAPE FEATURE_MS2_APEX_INTENSITY FEATURE_MS2_AREA_INTENSITY FEATURE_MS2_VAR_BSERIES_SCORE FEATURE_MS2_VAR_DOTPROD_SCORE FEATURE_MS2_VAR_ELUTION_MODEL_FIT_SCORE FEATURE_MS2_VAR_INTENSITY_SCORE FEATURE_MS2_VAR_ISOTOPE_CORRELATION_SCORE FEATURE_MS2_VAR_ISOTOPE_OVERLAP_SCORE FEATURE_MS2_VAR_LIBRARY_CORR FEATURE_MS2_VAR_LIBRARY_DOTPROD FEATURE_MS2_VAR_LIBRARY_MANHATTAN FEATURE_MS2_VAR_LIBRARY_RMSD FEATURE_MS2_VAR_LIBRARY_ROOTMEANSQUARE FEATURE_MS2_VAR_LIBRARY_SANGLE FEATURE_MS2_VAR_LOG_SN_SCORE FEATURE_MS2_VAR_MANHATTAN_SCORE FEATURE_MS2_VAR_MASSDEV_SCORE FEATURE_MS2_VAR_MASSDEV_SCORE_WEIGHTED FEATURE_MS2_VAR_NORM_RT_SCORE FEATURE_MS2_VAR_SONAR_LAG FEATURE_MS2_VAR_SONAR_LOG_DIFF FEATURE_MS2_VAR_SONAR_LOG_SN FEATURE_MS2_VAR_SONAR_LOG_TREND FEATURE_MS2_VAR_SONAR_RSQ 
FEATURE_MS2_VAR_SONAR_SHAPE FEATURE_MS2_VAR_XCORR_COELUTION FEATURE_MS2_VAR_XCORR_COELUTION_WEIGHTED FEATURE_MS2_VAR_XCORR_SHAPE FEATURE_MS2_VAR_XCORR_SHAPE_WEIGHTED FEATURE_MS2_VAR_YSERIES_SCORE FEATURE_TRANSITION_APEX_INTENSITY FEATURE_TRANSITION_AREA_INTENSITY FEATURE_TRANSITION_VAR_ISOTOPE_CORRELATION_SCORE FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE FEATURE_TRANSITION_VAR_LOG_INTENSITY FEATURE_TRANSITION_VAR_LOG_SN_SCORE FEATURE_TRANSITION_VAR_MASSDEV_SCORE FEATURE_TRANSITION_VAR_XCORR_COELUTION FEATURE_TRANSITION_VAR_XCORR_SHAPE FILENAME GENE_DECOY GENE_ID GENE_NAME IM_leftWidth IM_rightWidth IPF_PEPTIDE_ID LEFT_WIDTH MODIFIED_SEQUENCE NORM_RT PEPTIDE_DECOY PEPTIDE_ID PRECURSOR_CHARGE PRECURSOR_DECOY PRECURSOR_GROUP_LABEL PRECURSOR_ID PRECURSOR_LIBRARY_DRIFT_TIME PRECURSOR_LIBRARY_INTENSITY PRECURSOR_LIBRARY_RT PRECURSOR_MZ PRECURSOR_TRAML_ID PRODUCT_MZ PROTEIN_ACCESSION PROTEIN_DECOY PROTEIN_ID RIGHT_WIDTH RUN_ID SCORE_MS2_PEAK_GROUP_RANK SCORE_MS2_PEP SCORE_MS2_P_VALUE SCORE_MS2_Q_VALUE SCORE_MS2_SCORE SCORE_PEPTIDE_GLOBAL_PEP SCORE_PEPTIDE_GLOBAL_P_VALUE SCORE_PEPTIDE_GLOBAL_Q_VALUE SCORE_PEPTIDE_GLOBAL_SCORE SCORE_PROTEIN_GLOBAL_PEP SCORE_PROTEIN_GLOBAL_P_VALUE SCORE_PROTEIN_GLOBAL_Q_VALUE SCORE_PROTEIN_GLOBAL_SCORE TRANSITION_CHARGE TRANSITION_DECOY TRANSITION_DETECTING TRANSITION_ID TRANSITION_LIBRARY_INTENSITY TRANSITION_ORDINAL TRANSITION_TRAML_ID TRANSITION_TYPE UNMODIFIED_SEQUENCE +0 NaN 65.971 NaN 2661.55 -4409520928686189639 117220.748 854645.0 0.9834 0.1247 1.3700 0.000 0.9906 30361.0 207283.0 9.0 0.7707 NaN 0.7811 0.9961 0.0000 0.9987 0.9978 0.0659 0.0239 0.0262 0.0725 4.7380 0.7450 0.3397 0.1793 0.0194 NaN NaN NaN NaN NaN NaN 0.0000 0.0000 0.9936 0.9958 11.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 32 2640.510 ADSTGTLVITDPTR(UniMod:267) 28.437 0 33 2 0 NA 0 NaN NaN 26.5 728.879 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... 
NaN AQUA4SWATH_HMLangeA NaN 0 2705.370 -8670811102654834151 1 0.0031 0.0029 0.0033 5.7300 0.0031 0.0029 0.0033 5.730 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN NaN NaN ADSTGTLVITDPTR +1 NaN 10.166 NaN 2605.74 260819276075322832 8790.781 104006.0 0.9554 0.2666 5.4200 5.142 0.6532 990.0 6385.0 2.0 0.7609 NaN 0.0241 0.9216 0.1104 0.8270 0.9764 0.2222 0.0995 0.1101 0.3578 1.3130 0.7674 4.5390 3.5100 0.0032 NaN NaN NaN NaN NaN NaN 7.0470 2.3100 0.7805 0.8341 6.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 32 2575.639 ADSTGTLVITDPTR(UniMod:267) 26.819 0 33 2 0 NA 0 NaN NaN 26.5 728.879 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... NaN AQUA4SWATH_HMLangeA NaN 0 2623.439 -8670811102654834151 2 1.0000 0.0674 0.0684 1.2400 0.0031 0.0029 0.0033 5.730 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN NaN NaN ADSTGTLVITDPTR +2 NaN 237.192 NaN 2832.77 5163914660633416481 10419.743 241873.0 0.6122 0.4706 8.9900 4.008 0.5985 546.0 5180.0 2.0 0.7922 NaN 0.0195 0.8417 0.0911 0.9915 0.9959 0.0958 0.0387 0.0426 0.1243 0.6698 0.6862 4.7320 2.9940 0.0690 NaN NaN NaN NaN NaN NaN 4.3560 2.0940 0.6909 0.6974 6.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 32 2811.199 ADSTGTLVITDPTR(UniMod:267) 33.402 0 33 2 0 NA 0 NaN NaN 26.5 728.879 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... 
NaN AQUA4SWATH_HMLangeA NaN 0 2855.580 -8670811102654834151 5 1.0000 0.4692 0.4692 -0.1013 0.0031 0.0029 0.0033 5.730 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN NaN NaN ADSTGTLVITDPTR +3 NaN 199.484 NaN 2795.06 6932937885234622359 4036.559 25862.3 0.1872 2.4430 1.8500 4.008 0.6421 497.0 2693.0 4.0 0.7883 NaN 0.0101 0.6803 0.1793 0.4553 0.9481 0.3083 0.1493 0.1881 0.6202 0.6284 0.6985 5.4810 3.8880 0.0581 NaN NaN NaN NaN NaN NaN 1.6480 0.9185 0.7954 0.7970 6.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 32 2790.719 ADSTGTLVITDPTR(UniMod:267) 32.309 0 33 2 0 NA 0 NaN NaN 26.5 728.879 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... NaN AQUA4SWATH_HMLangeA NaN 0 2811.199 -8670811102654834151 4 1.0000 0.1994 0.2017 0.6777 0.0031 0.0029 0.0033 5.730 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN NaN NaN ADSTGTLVITDPTR +4 NaN 112.954 NaN 2708.53 8534214264242363560 5750.471 73215.2 -0.3691 0.7498 7.1610 6.750 0.4827 539.0 3838.0 3.0 0.8180 NaN 0.0145 0.7659 0.1334 0.8343 0.9736 0.2367 0.1054 0.1165 0.3772 0.6034 0.6467 2.5630 1.1470 0.0330 NaN NaN NaN NaN NaN NaN 3.4650 0.9346 0.6790 0.7379 5.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 32 2705.370 ADSTGTLVITDPTR(UniMod:267) 29.800 0 33 2 0 NA 0 NaN NaN 26.5 728.879 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... 
NaN AQUA4SWATH_HMLangeA NaN 0 2736.090 -8670811102654834151 3 1.0000 0.1994 0.2017 0.8151 0.0031 0.0029 0.0033 5.730 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN NaN NaN ADSTGTLVITDPTR +5 NaN -33.892 NaN 2915.46 -7771919224870429764 184097.499 1350180.0 0.9790 0.1076 0.4783 0.000 0.9955 48041.0 356015.0 11.0 0.7671 NaN 0.7752 0.9925 0.0000 0.8732 0.9889 0.1414 0.0968 0.1029 0.2899 4.6340 0.7585 1.1500 1.3230 0.0100 NaN NaN NaN NaN NaN NaN 0.0000 0.0000 0.9986 0.9984 9.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 79 2892.929 ALGYEDATQALGR(UniMod:267) 35.800 0 80 2 0 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.848 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... NaN AQUA4SWATH_HMLangeA NaN 0 2947.550 -8670811102654834151 1 0.0031 0.0029 0.0033 5.6540 0.0031 0.0029 0.0033 5.654 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN NaN NaN ALGYEDATQALGR +6 NaN 116.099 NaN 3065.45 -1618439206596772741 2301.782 104447.0 -0.2526 3.0020 27.2970 6.244 0.6421 1677.0 14023.0 8.0 0.7683 NaN 0.0305 0.4049 0.0000 0.9857 0.9992 0.0354 0.0206 0.0227 0.0631 1.2420 0.7015 4.4870 5.4270 0.0335 NaN NaN NaN NaN NaN NaN 0.5749 0.1142 0.7703 0.7332 5.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 79 3043.139 ALGYEDATQALGR(UniMod:267) 40.149 0 80 2 0 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.848 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... 
NaN AQUA4SWATH_HMLangeA NaN 0 3087.520 -8670811102654834151 2 1.0000 0.0674 0.0684 1.5180 0.0031 0.0029 0.0033 5.654 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN NaN NaN ALGYEDATQALGR +7 NaN 159.340 NaN 3108.69 -1528523763856537095 3486.515 55319.7 -0.4335 3.6970 7.6740 6.721 0.6223 908.0 7875.0 4.0 0.5767 NaN 0.0171 0.6492 0.3227 0.7615 0.9878 0.1378 0.0796 0.0899 0.2518 0.7010 0.9173 6.5040 6.5550 0.0460 NaN NaN NaN NaN NaN NaN 5.2080 3.1140 0.8087 0.7577 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 79 3087.520 ALGYEDATQALGR(UniMod:267) 41.403 0 80 2 0 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.848 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... NaN AQUA4SWATH_HMLangeA NaN 0 3125.070 -8670811102654834151 5 1.0000 0.4692 0.4692 -1.2620 0.0031 0.0029 0.0033 5.654 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN NaN NaN ALGYEDATQALGR +8 NaN -63.423 NaN 2885.93 4932359594612728841 8025.560 70488.3 0.5050 1.5710 3.6480 4.910 0.4855 1921.0 6866.0 5.0 0.3594 NaN 0.0149 0.4893 0.8435 -0.6868 0.8658 0.4972 0.3251 0.3575 0.9444 1.5310 1.1330 4.0150 5.9690 0.0186 NaN NaN NaN NaN NaN NaN 0.0000 0.0000 0.8911 0.8734 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 79 2869.030 ALGYEDATQALGR(UniMod:267) 34.944 0 80 2 0 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.848 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... 
NaN AQUA4SWATH_HMLangeA NaN 0 2889.510 -8670811102654834151 4 1.0000 0.4692 0.4692 -1.1130 0.0031 0.0029 0.0033 5.654 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN NaN NaN ALGYEDATQALGR +9 NaN 288.894 NaN 3238.25 9173417844487548347 3238.095 65173.1 -0.7410 2.1660 10.3700 9.000 0.5089 1051.0 9306.0 6.0 0.7827 NaN 0.0203 0.5781 0.3885 0.8789 0.9890 0.1242 0.0848 0.0992 0.2788 0.8313 0.6931 2.6920 1.8250 0.0836 NaN NaN NaN NaN NaN NaN 4.2480 2.0190 0.7743 0.7169 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 79 3207.010 ALGYEDATQALGR(UniMod:267) 45.159 0 80 2 0 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.848 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... NaN AQUA4SWATH_HMLangeA NaN 0 3247.969 -8670811102654834151 3 1.0000 0.4692 0.4692 -0.5351 0.0031 0.0029 0.0033 5.654 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN NaN NaN ALGYEDATQALGR diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_parquet_export_scored_osw.out b/tests/_regtest_outputs/test_pyprophet_export.test_parquet_export_scored_osw.out index b80556ba..48d2d408 100644 --- a/tests/_regtest_outputs/test_pyprophet_export.test_parquet_export_scored_osw.out +++ b/tests/_regtest_outputs/test_pyprophet_export.test_parquet_export_scored_osw.out @@ -1,13 +1,13 @@ Exported 97964 rows with 100 columns Score columns found: ['SCORE_MS2_PEAK_GROUP_RANK', 'SCORE_MS2_PEP', 'SCORE_MS2_P_VALUE', 'SCORE_MS2_Q_VALUE', 'SCORE_MS2_SCORE', 'SCORE_PEPTIDE_GLOBAL_PEP', 'SCORE_PEPTIDE_GLOBAL_P_VALUE', 'SCORE_PEPTIDE_GLOBAL_Q_VALUE', 'SCORE_PEPTIDE_GLOBAL_SCORE', 'SCORE_PROTEIN_GLOBAL_PEP', 'SCORE_PROTEIN_GLOBAL_P_VALUE', 'SCORE_PROTEIN_GLOBAL_Q_VALUE', 'SCORE_PROTEIN_GLOBAL_SCORE'] - ANNOTATION DELTA_RT EXP_IM EXP_RT FEATURE_ID FEATURE_MS1_APEX_INTENSITY FEATURE_MS1_AREA_INTENSITY FEATURE_MS1_VAR_ISOTOPE_CORRELATION_SCORE FEATURE_MS1_VAR_ISOTOPE_OVERLAP_SCORE FEATURE_MS1_VAR_MASSDEV_SCORE FEATURE_MS1_VAR_XCORR_COELUTION 
FEATURE_MS1_VAR_XCORR_SHAPE FEATURE_MS2_APEX_INTENSITY FEATURE_MS2_AREA_INTENSITY FEATURE_MS2_VAR_BSERIES_SCORE FEATURE_MS2_VAR_DOTPROD_SCORE FEATURE_MS2_VAR_ELUTION_MODEL_FIT_SCORE FEATURE_MS2_VAR_INTENSITY_SCORE FEATURE_MS2_VAR_ISOTOPE_CORRELATION_SCORE FEATURE_MS2_VAR_ISOTOPE_OVERLAP_SCORE FEATURE_MS2_VAR_LIBRARY_CORR FEATURE_MS2_VAR_LIBRARY_DOTPROD FEATURE_MS2_VAR_LIBRARY_MANHATTAN FEATURE_MS2_VAR_LIBRARY_RMSD FEATURE_MS2_VAR_LIBRARY_ROOTMEANSQUARE FEATURE_MS2_VAR_LIBRARY_SANGLE FEATURE_MS2_VAR_LOG_SN_SCORE FEATURE_MS2_VAR_MANHATTAN_SCORE FEATURE_MS2_VAR_MASSDEV_SCORE FEATURE_MS2_VAR_MASSDEV_SCORE_WEIGHTED FEATURE_MS2_VAR_NORM_RT_SCORE FEATURE_MS2_VAR_SONAR_LAG FEATURE_MS2_VAR_SONAR_LOG_DIFF FEATURE_MS2_VAR_SONAR_LOG_SN FEATURE_MS2_VAR_SONAR_LOG_TREND FEATURE_MS2_VAR_SONAR_RSQ FEATURE_MS2_VAR_SONAR_SHAPE FEATURE_MS2_VAR_XCORR_COELUTION FEATURE_MS2_VAR_XCORR_COELUTION_WEIGHTED FEATURE_MS2_VAR_XCORR_SHAPE FEATURE_MS2_VAR_XCORR_SHAPE_WEIGHTED FEATURE_MS2_VAR_YSERIES_SCORE FEATURE_TRANSITION_APEX_INTENSITY FEATURE_TRANSITION_AREA_INTENSITY FEATURE_TRANSITION_VAR_ISOTOPE_CORRELATION_SCORE FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE FEATURE_TRANSITION_VAR_LOG_INTENSITY FEATURE_TRANSITION_VAR_LOG_SN_SCORE FEATURE_TRANSITION_VAR_MASSDEV_SCORE FEATURE_TRANSITION_VAR_XCORR_COELUTION FEATURE_TRANSITION_VAR_XCORR_SHAPE FILENAME GENE_DECOY GENE_ID GENE_NAME IM_leftWidth IM_rightWidth IPF_PEPTIDE_ID LEFT_WIDTH MODIFIED_SEQUENCE NORM_RT PEPTIDE_DECOY PEPTIDE_ID PRECURSOR_CHARGE PRECURSOR_DECOY PRECURSOR_GROUP_LABEL PRECURSOR_ID PRECURSOR_LIBRARY_DRIFT_TIME PRECURSOR_LIBRARY_INTENSITY PRECURSOR_LIBRARY_RT PRECURSOR_MZ PRECURSOR_TRAML_ID PRODUCT_MZ PROTEIN_ACCESSION PROTEIN_DECOY PROTEIN_ID RIGHT_WIDTH RUN_ID SCORE_MS2_PEAK_GROUP_RANK SCORE_MS2_PEP SCORE_MS2_P_VALUE SCORE_MS2_Q_VALUE SCORE_MS2_SCORE SCORE_PEPTIDE_GLOBAL_PEP SCORE_PEPTIDE_GLOBAL_P_VALUE SCORE_PEPTIDE_GLOBAL_Q_VALUE SCORE_PEPTIDE_GLOBAL_SCORE SCORE_PROTEIN_GLOBAL_PEP SCORE_PROTEIN_GLOBAL_P_VALUE 
SCORE_PROTEIN_GLOBAL_Q_VALUE SCORE_PROTEIN_GLOBAL_SCORE TRANSITION_CHARGE TRANSITION_DECOY TRANSITION_DETECTING TRANSITION_ID TRANSITION_LIBRARY_INTENSITY TRANSITION_ORDINAL TRANSITION_TRAML_ID TRANSITION_TYPE UNMODIFIED_SEQUENCE -0 None 65.9712 NaN 2661.55 -4.4095e+18 117220.7500 8.5464e+05 0.9835 0.1247 1.3707 0.0000 0.9907 30361.0 207283.0 9.0 0.7708 NaN 0.7811 0.9962 0.0000 0.9987 0.9978 0.0659 0.0239 0.0262 0.0725 4.7388 0.7451 0.3398 0.1793 0.0194 NaN NaN NaN NaN NaN NaN 0.0000 0.0000 0.9936 0.9958 11.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 32.0 2640.5100 ADSTGTLVITDPTR(UniMod:267) 28.4379 False 33.0 2.0 False NA 0 NaN NaN 26.5 728.8795 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... NaN AQUA4SWATH_HMLangeA None 0.0 2705.3701 -8.6708e+18 1.0 0.0031 0.0029 0.0033 5.7301 0.0031 0.0029 0.0033 5.7301 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ADSTGTLVITDPTR -1 None 10.1667 NaN 2605.74 2.6082e+17 8790.7812 1.0401e+05 0.9555 0.2667 5.4202 5.1430 0.6532 990.0 6385.0 2.0 0.7610 NaN 0.0241 0.9216 0.1104 0.8271 0.9764 0.2223 0.0995 0.1102 0.3579 1.3130 0.7675 4.5391 3.5103 0.0032 NaN NaN NaN NaN NaN NaN 7.0474 2.3104 0.7806 0.8341 6.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 32.0 2575.6399 ADSTGTLVITDPTR(UniMod:267) 26.8198 False 33.0 2.0 False NA 0 NaN NaN 26.5 728.8795 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... 
NaN AQUA4SWATH_HMLangeA None 0.0 2623.4399 -8.6708e+18 2.0 1.0000 0.0674 0.0685 1.2404 0.0031 0.0029 0.0033 5.7301 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ADSTGTLVITDPTR -2 None 237.1922 NaN 2832.77 5.1639e+18 10419.7432 2.4187e+05 0.6123 0.4707 8.9907 4.0083 0.5985 546.0 5180.0 2.0 0.7923 NaN 0.0195 0.8418 0.0911 0.9916 0.9960 0.0958 0.0387 0.0426 0.1243 0.6699 0.6863 4.7328 2.9948 0.0690 NaN NaN NaN NaN NaN NaN 4.3568 2.0950 0.6909 0.6974 6.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 32.0 2811.2000 ADSTGTLVITDPTR(UniMod:267) 33.4026 False 33.0 2.0 False NA 0 NaN NaN 26.5 728.8795 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... NaN AQUA4SWATH_HMLangeA None 0.0 2855.5801 -8.6708e+18 5.0 1.0000 0.4692 0.4692 -0.1013 0.0031 0.0029 0.0033 5.7301 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ADSTGTLVITDPTR -3 None 199.4846 NaN 2795.06 6.9329e+18 4036.5601 2.5862e+04 0.1872 2.4435 1.8505 4.0083 0.6422 497.0 2693.0 4.0 0.7883 NaN 0.0101 0.6804 0.1794 0.4554 0.9481 0.3084 0.1494 0.1882 0.6202 0.6284 0.6986 5.4811 3.8885 0.0581 NaN NaN NaN NaN NaN NaN 1.6487 0.9186 0.7955 0.7971 6.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 32.0 2790.7200 ADSTGTLVITDPTR(UniMod:267) 32.3092 False 33.0 2.0 False NA 0 NaN NaN 26.5 728.8795 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... 
NaN AQUA4SWATH_HMLangeA None 0.0 2811.2000 -8.6708e+18 4.0 1.0000 0.1994 0.2018 0.6777 0.0031 0.0029 0.0033 5.7301 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ADSTGTLVITDPTR -4 None 112.9550 NaN 2708.53 8.5342e+18 5750.4717 7.3215e+04 -0.3692 0.7498 7.1610 6.7500 0.4827 539.0 3838.0 3.0 0.8181 NaN 0.0145 0.7660 0.1334 0.8344 0.9736 0.2367 0.1055 0.1166 0.3772 0.6034 0.6468 2.5636 1.1471 0.0330 NaN NaN NaN NaN NaN NaN 3.4656 0.9347 0.6790 0.7379 5.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 32.0 2705.3701 ADSTGTLVITDPTR(UniMod:267) 29.8002 False 33.0 2.0 False NA 0 NaN NaN 26.5 728.8795 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... NaN AQUA4SWATH_HMLangeA None 0.0 2736.0901 -8.6708e+18 3.0 1.0000 0.1994 0.2018 0.8151 0.0031 0.0029 0.0033 5.7301 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ADSTGTLVITDPTR -5 None -33.8921 NaN 2915.46 -7.7719e+18 184097.5000 1.3502e+06 0.9791 0.1077 0.4784 0.0000 0.9955 48041.0 356015.0 11.0 0.7672 NaN 0.7753 0.9926 0.0000 0.8733 0.9890 0.1414 0.0968 0.1030 0.2900 4.6343 0.7586 1.1505 1.3231 0.0100 NaN NaN NaN NaN NaN NaN 0.0000 0.0000 0.9987 0.9985 9.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 79.0 2892.9299 ALGYEDATQALGR(UniMod:267) 35.8003 False 80.0 2.0 False AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.8480 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... 
NaN AQUA4SWATH_HMLangeA None 0.0 2947.5500 -8.6708e+18 1.0 0.0031 0.0029 0.0033 5.6547 0.0031 0.0029 0.0033 5.6547 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ALGYEDATQALGR -6 None 116.0990 NaN 3065.45 -1.6184e+18 2301.7830 1.0445e+05 -0.2527 3.0023 27.2973 6.2440 0.6421 1677.0 14023.0 8.0 0.7684 NaN 0.0305 0.4049 0.0000 0.9857 0.9992 0.0354 0.0206 0.0227 0.0631 1.2430 0.7016 4.4870 5.4277 0.0335 NaN NaN NaN NaN NaN NaN 0.5749 0.1143 0.7704 0.7332 5.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 79.0 3043.1399 ALGYEDATQALGR(UniMod:267) 40.1495 False 80.0 2.0 False AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.8480 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... NaN AQUA4SWATH_HMLangeA None 0.0 3087.5200 -8.6708e+18 2.0 1.0000 0.0674 0.0685 1.5182 0.0031 0.0029 0.0033 5.6547 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ALGYEDATQALGR -7 None 159.3408 NaN 3108.69 -1.5285e+18 3486.5154 5.5320e+04 -0.4336 3.6979 7.6742 6.7217 0.6224 908.0 7875.0 4.0 0.5768 NaN 0.0171 0.6492 0.3228 0.7616 0.9878 0.1379 0.0796 0.0899 0.2518 0.7011 0.9173 6.5049 6.5553 0.0460 NaN NaN NaN NaN NaN NaN 5.2085 3.1141 0.8087 0.7578 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 79.0 3087.5200 ALGYEDATQALGR(UniMod:267) 41.4033 False 80.0 2.0 False AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.8480 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... 
NaN AQUA4SWATH_HMLangeA None 0.0 3125.0701 -8.6708e+18 5.0 1.0000 0.4692 0.4692 -1.2624 0.0031 0.0029 0.0033 5.6547 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ALGYEDATQALGR -8 None -63.4234 NaN 2885.93 4.9324e+18 8025.5601 7.0488e+04 0.5051 1.5720 3.6487 4.9107 0.4855 1921.0 6866.0 5.0 0.3595 NaN 0.0150 0.4893 0.8436 -0.6868 0.8659 0.4973 0.3252 0.3576 0.9445 1.5314 1.1339 4.0154 5.9697 0.0186 NaN NaN NaN NaN NaN NaN 0.0000 0.0000 0.8911 0.8734 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 79.0 2869.0300 ALGYEDATQALGR(UniMod:267) 34.9440 False 80.0 2.0 False AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.8480 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... NaN AQUA4SWATH_HMLangeA None 0.0 2889.5100 -8.6708e+18 4.0 1.0000 0.4692 0.4692 -1.1134 0.0031 0.0029 0.0033 5.6547 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ALGYEDATQALGR -9 None 288.8942 NaN 3238.25 9.1734e+18 3238.0957 6.5173e+04 -0.7411 2.1667 10.3701 9.0000 0.5089 1051.0 9306.0 6.0 0.7827 NaN 0.0203 0.5781 0.3886 0.8789 0.9890 0.1243 0.0848 0.0992 0.2789 0.8313 0.6932 2.6921 1.8259 0.0836 NaN NaN NaN NaN NaN NaN 4.2487 2.0195 0.7744 0.7170 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz None NaN None NaN NaN 79.0 3207.0100 ALGYEDATQALGR(UniMod:267) 45.1599 False 80.0 2.0 False AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.8480 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... 
NaN AQUA4SWATH_HMLangeA None 0.0 3247.9700 -8.6708e+18 3.0 1.0000 0.4692 0.4692 -0.5351 0.0031 0.0029 0.0033 5.6547 0.3674 0.0625 0.0625 5.8401 NaN None None NaN NaN NaN None None ALGYEDATQALGR + ANNOTATION DELTA_RT EXP_IM EXP_RT FEATURE_ID FEATURE_MS1_APEX_INTENSITY FEATURE_MS1_AREA_INTENSITY FEATURE_MS1_VAR_ISOTOPE_CORRELATION_SCORE FEATURE_MS1_VAR_ISOTOPE_OVERLAP_SCORE FEATURE_MS1_VAR_MASSDEV_SCORE FEATURE_MS1_VAR_XCORR_COELUTION FEATURE_MS1_VAR_XCORR_SHAPE FEATURE_MS2_APEX_INTENSITY FEATURE_MS2_AREA_INTENSITY FEATURE_MS2_VAR_BSERIES_SCORE FEATURE_MS2_VAR_DOTPROD_SCORE FEATURE_MS2_VAR_ELUTION_MODEL_FIT_SCORE FEATURE_MS2_VAR_INTENSITY_SCORE FEATURE_MS2_VAR_ISOTOPE_CORRELATION_SCORE FEATURE_MS2_VAR_ISOTOPE_OVERLAP_SCORE FEATURE_MS2_VAR_LIBRARY_CORR FEATURE_MS2_VAR_LIBRARY_DOTPROD FEATURE_MS2_VAR_LIBRARY_MANHATTAN FEATURE_MS2_VAR_LIBRARY_RMSD FEATURE_MS2_VAR_LIBRARY_ROOTMEANSQUARE FEATURE_MS2_VAR_LIBRARY_SANGLE FEATURE_MS2_VAR_LOG_SN_SCORE FEATURE_MS2_VAR_MANHATTAN_SCORE FEATURE_MS2_VAR_MASSDEV_SCORE FEATURE_MS2_VAR_MASSDEV_SCORE_WEIGHTED FEATURE_MS2_VAR_NORM_RT_SCORE FEATURE_MS2_VAR_SONAR_LAG FEATURE_MS2_VAR_SONAR_LOG_DIFF FEATURE_MS2_VAR_SONAR_LOG_SN FEATURE_MS2_VAR_SONAR_LOG_TREND FEATURE_MS2_VAR_SONAR_RSQ FEATURE_MS2_VAR_SONAR_SHAPE FEATURE_MS2_VAR_XCORR_COELUTION FEATURE_MS2_VAR_XCORR_COELUTION_WEIGHTED FEATURE_MS2_VAR_XCORR_SHAPE FEATURE_MS2_VAR_XCORR_SHAPE_WEIGHTED FEATURE_MS2_VAR_YSERIES_SCORE FEATURE_TRANSITION_APEX_INTENSITY FEATURE_TRANSITION_AREA_INTENSITY FEATURE_TRANSITION_VAR_ISOTOPE_CORRELATION_SCORE FEATURE_TRANSITION_VAR_ISOTOPE_OVERLAP_SCORE FEATURE_TRANSITION_VAR_LOG_INTENSITY FEATURE_TRANSITION_VAR_LOG_SN_SCORE FEATURE_TRANSITION_VAR_MASSDEV_SCORE FEATURE_TRANSITION_VAR_XCORR_COELUTION FEATURE_TRANSITION_VAR_XCORR_SHAPE FILENAME GENE_DECOY GENE_ID GENE_NAME IM_leftWidth IM_rightWidth IPF_PEPTIDE_ID LEFT_WIDTH MODIFIED_SEQUENCE NORM_RT PEPTIDE_DECOY PEPTIDE_ID PRECURSOR_CHARGE PRECURSOR_DECOY PRECURSOR_GROUP_LABEL PRECURSOR_ID 
PRECURSOR_LIBRARY_DRIFT_TIME PRECURSOR_LIBRARY_INTENSITY PRECURSOR_LIBRARY_RT PRECURSOR_MZ PRECURSOR_TRAML_ID PRODUCT_MZ PROTEIN_ACCESSION PROTEIN_DECOY PROTEIN_ID RIGHT_WIDTH RUN_ID SCORE_MS2_PEAK_GROUP_RANK SCORE_MS2_PEP SCORE_MS2_P_VALUE SCORE_MS2_Q_VALUE SCORE_MS2_SCORE SCORE_PEPTIDE_GLOBAL_PEP SCORE_PEPTIDE_GLOBAL_P_VALUE SCORE_PEPTIDE_GLOBAL_Q_VALUE SCORE_PEPTIDE_GLOBAL_SCORE SCORE_PROTEIN_GLOBAL_PEP SCORE_PROTEIN_GLOBAL_P_VALUE SCORE_PROTEIN_GLOBAL_Q_VALUE SCORE_PROTEIN_GLOBAL_SCORE TRANSITION_CHARGE TRANSITION_DECOY TRANSITION_DETECTING TRANSITION_ID TRANSITION_LIBRARY_INTENSITY TRANSITION_ORDINAL TRANSITION_TRAML_ID TRANSITION_TYPE UNMODIFIED_SEQUENCE +0 None 65.971 NaN 2661.55 -4.4095e+18 117220.748 854645.0 0.9834 0.1247 1.3700 0.000 0.9906 30361.0 207283.0 9.0 0.7707 NaN 0.7811 0.9961 0.0000 0.9987 0.9978 0.0659 0.0239 0.0262 0.0725 4.7380 0.7450 0.3397 0.1793 0.0194 NaN NaN NaN NaN NaN NaN 0.0000 0.0000 0.9936 0.9958 11.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 32.0 2640.510 ADSTGTLVITDPTR(UniMod:267) 28.437 0.0 33.0 2.0 0.0 NA 0 NaN NaN 26.5 728.879 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... NaN AQUA4SWATH_HMLangeA NaN 0.0 2705.370 -8.6708e+18 1.0 0.0031 0.0029 0.0033 5.7300 0.0031 0.0029 0.0033 5.730 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN None None ADSTGTLVITDPTR +1 None 10.166 NaN 2605.74 2.6082e+17 8790.781 104006.0 0.9554 0.2666 5.4200 5.142 0.6532 990.0 6385.0 2.0 0.7609 NaN 0.0241 0.9216 0.1104 0.8270 0.9764 0.2222 0.0995 0.1101 0.3578 1.3130 0.7674 4.5390 3.5100 0.0032 NaN NaN NaN NaN NaN NaN 7.0470 2.3100 0.7805 0.8341 6.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 32.0 2575.639 ADSTGTLVITDPTR(UniMod:267) 26.819 0.0 33.0 2.0 0.0 NA 0 NaN NaN 26.5 728.879 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... 
NaN AQUA4SWATH_HMLangeA NaN 0.0 2623.439 -8.6708e+18 2.0 1.0000 0.0674 0.0684 1.2400 0.0031 0.0029 0.0033 5.730 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN None None ADSTGTLVITDPTR +2 None 237.192 NaN 2832.77 5.1639e+18 10419.743 241873.0 0.6122 0.4706 8.9900 4.008 0.5985 546.0 5180.0 2.0 0.7922 NaN 0.0195 0.8417 0.0911 0.9915 0.9959 0.0958 0.0387 0.0426 0.1243 0.6698 0.6862 4.7320 2.9940 0.0690 NaN NaN NaN NaN NaN NaN 4.3560 2.0940 0.6909 0.6974 6.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 32.0 2811.199 ADSTGTLVITDPTR(UniMod:267) 33.402 0.0 33.0 2.0 0.0 NA 0 NaN NaN 26.5 728.879 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... NaN AQUA4SWATH_HMLangeA NaN 0.0 2855.580 -8.6708e+18 5.0 1.0000 0.4692 0.4692 -0.1013 0.0031 0.0029 0.0033 5.730 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN None None ADSTGTLVITDPTR +3 None 199.484 NaN 2795.06 6.9329e+18 4036.559 25862.3 0.1872 2.4430 1.8500 4.008 0.6421 497.0 2693.0 4.0 0.7883 NaN 0.0101 0.6803 0.1793 0.4553 0.9481 0.3083 0.1493 0.1881 0.6202 0.6284 0.6985 5.4810 3.8880 0.0581 NaN NaN NaN NaN NaN NaN 1.6480 0.9185 0.7954 0.7970 6.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 32.0 2790.719 ADSTGTLVITDPTR(UniMod:267) 32.309 0.0 33.0 2.0 0.0 NA 0 NaN NaN 26.5 728.879 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... 
NaN AQUA4SWATH_HMLangeA NaN 0.0 2811.199 -8.6708e+18 4.0 1.0000 0.1994 0.2017 0.6777 0.0031 0.0029 0.0033 5.730 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN None None ADSTGTLVITDPTR +4 None 112.954 NaN 2708.53 8.5342e+18 5750.471 73215.2 -0.3691 0.7498 7.1610 6.750 0.4827 539.0 3838.0 3.0 0.8180 NaN 0.0145 0.7659 0.1334 0.8343 0.9736 0.2367 0.1054 0.1165 0.3772 0.6034 0.6467 2.5630 1.1470 0.0330 NaN NaN NaN NaN NaN NaN 3.4650 0.9346 0.6790 0.7379 5.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 32.0 2705.370 ADSTGTLVITDPTR(UniMod:267) 29.800 0.0 33.0 2.0 0.0 NA 0 NaN NaN 26.5 728.879 AQUA4SWATH_HMLangeA_ADSTGTLVITDPTR(Label:13C(6... NaN AQUA4SWATH_HMLangeA NaN 0.0 2736.090 -8.6708e+18 3.0 1.0000 0.1994 0.2017 0.8151 0.0031 0.0029 0.0033 5.730 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN None None ADSTGTLVITDPTR +5 None -33.892 NaN 2915.46 -7.7719e+18 184097.499 1350180.0 0.9790 0.1076 0.4783 0.000 0.9955 48041.0 356015.0 11.0 0.7671 NaN 0.7752 0.9925 0.0000 0.8732 0.9889 0.1414 0.0968 0.1029 0.2899 4.6340 0.7585 1.1500 1.3230 0.0100 NaN NaN NaN NaN NaN NaN 0.0000 0.0000 0.9986 0.9984 9.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 79.0 2892.929 ALGYEDATQALGR(UniMod:267) 35.800 0.0 80.0 2.0 0.0 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.848 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... 
NaN AQUA4SWATH_HMLangeA NaN 0.0 2947.550 -8.6708e+18 1.0 0.0031 0.0029 0.0033 5.6540 0.0031 0.0029 0.0033 5.654 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN None None ALGYEDATQALGR +6 None 116.099 NaN 3065.45 -1.6184e+18 2301.782 104447.0 -0.2526 3.0020 27.2970 6.244 0.6421 1677.0 14023.0 8.0 0.7683 NaN 0.0305 0.4049 0.0000 0.9857 0.9992 0.0354 0.0206 0.0227 0.0631 1.2420 0.7015 4.4870 5.4270 0.0335 NaN NaN NaN NaN NaN NaN 0.5749 0.1142 0.7703 0.7332 5.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 79.0 3043.139 ALGYEDATQALGR(UniMod:267) 40.149 0.0 80.0 2.0 0.0 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.848 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... NaN AQUA4SWATH_HMLangeA NaN 0.0 3087.520 -8.6708e+18 2.0 1.0000 0.0674 0.0684 1.5180 0.0031 0.0029 0.0033 5.654 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN None None ALGYEDATQALGR +7 None 159.340 NaN 3108.69 -1.5285e+18 3486.515 55319.7 -0.4335 3.6970 7.6740 6.721 0.6223 908.0 7875.0 4.0 0.5767 NaN 0.0171 0.6492 0.3227 0.7615 0.9878 0.1378 0.0796 0.0899 0.2518 0.7010 0.9173 6.5040 6.5550 0.0460 NaN NaN NaN NaN NaN NaN 5.2080 3.1140 0.8087 0.7577 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 79.0 3087.520 ALGYEDATQALGR(UniMod:267) 41.403 0.0 80.0 2.0 0.0 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.848 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... 
NaN AQUA4SWATH_HMLangeA NaN 0.0 3125.070 -8.6708e+18 5.0 1.0000 0.4692 0.4692 -1.2620 0.0031 0.0029 0.0033 5.654 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN None None ALGYEDATQALGR +8 None -63.423 NaN 2885.93 4.9324e+18 8025.560 70488.3 0.5050 1.5710 3.6480 4.910 0.4855 1921.0 6866.0 5.0 0.3594 NaN 0.0149 0.4893 0.8435 -0.6868 0.8658 0.4972 0.3251 0.3575 0.9444 1.5310 1.1330 4.0150 5.9690 0.0186 NaN NaN NaN NaN NaN NaN 0.0000 0.0000 0.8911 0.8734 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 79.0 2869.030 ALGYEDATQALGR(UniMod:267) 34.944 0.0 80.0 2.0 0.0 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.848 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... NaN AQUA4SWATH_HMLangeA NaN 0.0 2889.510 -8.6708e+18 4.0 1.0000 0.4692 0.4692 -1.1130 0.0031 0.0029 0.0033 5.654 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN None None ALGYEDATQALGR +9 None 288.894 NaN 3238.25 9.1734e+18 3238.095 65173.1 -0.7410 2.1660 10.3700 9.000 0.5089 1051.0 9306.0 6.0 0.7827 NaN 0.0203 0.5781 0.3885 0.8789 0.9890 0.1242 0.0848 0.0992 0.2788 0.8313 0.6931 2.6920 1.8250 0.0836 NaN NaN NaN NaN NaN NaN 4.2480 2.0190 0.7743 0.7169 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN napedro_L120420_010_SW.mzXML.gz NaN NaN NaN NaN NaN 79.0 3207.010 ALGYEDATQALGR(UniMod:267) 45.159 0.0 80.0 2.0 0.0 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(UniMod:267)/2 1 NaN NaN 36.8 687.848 AQUA4SWATH_HMLangeA_ALGYEDATQALGR(Label:13C(6)... 
NaN AQUA4SWATH_HMLangeA NaN 0.0 3247.969 -8.6708e+18 3.0 1.0000 0.4692 0.4692 -0.5351 0.0031 0.0029 0.0033 5.654 0.3674 0.0625 0.0625 5.84 NaN NaN NaN NaN NaN NaN None None ALGYEDATQALGR diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_parquet_export_with_ipf.out b/tests/_regtest_outputs/test_pyprophet_export.test_parquet_export_with_ipf.out index 5ed4c2d2..e989ee5a 100644 --- a/tests/_regtest_outputs/test_pyprophet_export.test_parquet_export_with_ipf.out +++ b/tests/_regtest_outputs/test_pyprophet_export.test_parquet_export_with_ipf.out @@ -2,13 +2,13 @@ Exported 97964 rows with 105 columns SCORE_IPF columns found: ['SCORE_IPF_PEP', 'SCORE_IPF_PRECURSOR_PEAKGROUP_PEP', 'SCORE_IPF_QVALUE'] Sample data with IPF scores: FEATURE_ID SCORE_IPF_PEP SCORE_IPF_PRECURSOR_PEAKGROUP_PEP SCORE_IPF_QVALUE -0 4.8397e+17 0.0000e+00 2.1927e-07 0.0000e+00 -1 1.0824e+18 0.0000e+00 9.9998e-08 0.0000e+00 -2 -1.1854e+18 0.0000e+00 4.5323e-08 0.0000e+00 -3 6.8070e+18 0.0000e+00 1.5978e-09 0.0000e+00 -4 7.1486e+18 0.0000e+00 1.1662e-08 0.0000e+00 -5 9.0780e+17 2.5734e-10 1.6434e-04 9.0990e-13 -6 2.4200e+18 0.0000e+00 1.7179e-08 0.0000e+00 -7 -1.4753e+18 0.0000e+00 1.5978e-09 0.0000e+00 -8 5.4169e+18 0.0000e+00 4.0794e-08 0.0000e+00 -9 -3.0355e+17 0.0000e+00 2.0475e-08 0.0000e+00 +0 -4.4095e+18 0.0 6.0480e-09 0.0 +1 -7.7719e+18 0.0 7.0700e-10 0.0 +2 -7.9773e+17 0.0 1.1660e-08 0.0 +3 -1.7329e+18 0.0 7.0700e-10 0.0 +4 -6.7478e+18 0.0 7.0700e-10 0.0 +5 -7.1577e+18 0.0 1.2900e-08 0.0 +6 9.0905e+17 0.0 1.4230e-08 0.0 +7 4.7208e+18 0.0 5.6690e-06 0.0 +8 3.9209e+18 0.0 1.1660e-08 0.0 +9 -5.4563e+18 0.0 6.0480e-09 0.0 diff --git a/tests/test_pyprophet_export.py b/tests/test_pyprophet_export.py index 43d7f0ca..3e1e0f5c 100644 --- a/tests/test_pyprophet_export.py +++ b/tests/test_pyprophet_export.py @@ -367,7 +367,7 @@ def test_parquet_export_scored_osw(test_data_osw, temp_folder, regtest): df = sort_parquet_export_frame(df) print(f"Exported {len(df)} rows with {len(df.columns)} 
columns", file=regtest) print(f"Score columns found: {sorted(score_columns)}", file=regtest) - print(df.head(10).sort_index(axis=1), file=regtest) + print(_normalize_regtest_frame(df, head=10), file=regtest) def test_parquet_export_no_transition_data(test_data_osw, temp_folder, regtest): @@ -412,7 +412,7 @@ def test_parquet_export_no_transition_data(test_data_osw, temp_folder, regtest): file=regtest, ) print(f"Score columns found: {sorted(score_columns)}", file=regtest) - print(df.head(10).sort_index(axis=1), file=regtest) + print(_normalize_regtest_frame(df, head=10), file=regtest) def test_parquet_export_split_format(test_data_osw, temp_folder, regtest): @@ -513,11 +513,17 @@ def test_parquet_export_with_ipf(test_data_osw, temp_folder, regtest): expected_ipf_columns = ['SCORE_IPF_PRECURSOR_PEAKGROUP_PEP', 'SCORE_IPF_PEP', 'SCORE_IPF_QVALUE'] for col in expected_ipf_columns: assert col in df.columns, f"Expected column {col} not found in exported parquet" + + df = sort_parquet_export_frame(df) + ipf_sample = df.loc[ + df[ipf_columns].notna().any(axis=1), + ['FEATURE_ID'] + ipf_columns, + ] print(f"Exported {len(df)} rows with {len(df.columns)} columns", file=regtest) print(f"SCORE_IPF columns found: {sorted(ipf_columns)}", file=regtest) print("Sample data with IPF scores:", file=regtest) - print(df[['FEATURE_ID'] + ipf_columns].head(10).sort_index(axis=1), file=regtest) + print(_normalize_regtest_frame(ipf_sample, head=10), file=regtest) # ================== FEATURE SCORES EXPORT TESTS ==================