"""Main functions for pysec2pri.
This module provides functions for parsing biological database
secondary-to-primary mapping files and generating and using the standardized Mapping sets.
"""
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING, Any
from pysec2pri.exports import (
write_json,
write_name2synonym,
write_output,
write_owl,
write_pri_ids,
write_rdf,
write_sec2pri,
write_sssom,
)
from pysec2pri.exports import (
write_symbol2prev as write_symbol_sec2pri,
)
if TYPE_CHECKING:
import pandas as pd
from pysec2pri.diff import MappingDiff
from pysec2pri.parsers.base import Sec2PriMappingSet
from pysec2pri.parsers.base import AmbiguousMappingSet, IdMappingSet, LabelMappingSet
# Public API of this module (alphabetically sorted).
# ``generate_hgnc_primary_symbols`` is listed alongside its ChEBI/NCBI
# counterparts so that ``from <module> import *`` exposes it as well.
__all__ = [
    "combine_mapping_sets",
    "find_ambiguous",
    "generate_chebi",
    "generate_chebi_primary_ids",
    "generate_chebi_primary_symbols",
    "generate_chebi_synonyms",
    "generate_hgnc",
    "generate_hgnc_primary_ids",
    "generate_hgnc_primary_symbols",
    "generate_hgnc_symbols",
    "generate_hmdb",
    "generate_hmdb_primary_ids",
    "generate_hmdb_proteins",
    "generate_ncbi",
    "generate_ncbi_primary_ids",
    "generate_ncbi_primary_symbols",
    "generate_ncbi_symbols",
    "generate_uniprot",
    "generate_uniprot_primary_ids",
    "generate_wikidata",
    "generate_wikidata_symbols",
    "list_versions",
    "load_label_mapping",
    "load_mapping",
    "resolve_ids",
    "resolve_symbols",
    "save",
    "write_all_formats",
    "write_diff_output",
    "write_json",
    "write_name2synonym",
    "write_output",
    "write_owl",
    "write_rdf",
    "write_sec2pri",
    "write_sssom",
    "write_symbol_sec2pri",
]
def _auto_download(
    datasource: str,
    version: str | None = None,
    keys: list[str] | None = None,
) -> dict[str, Path]:
    """Fetch the files of *datasource* into a freshly created temporary directory.

    The directory is created with :func:`tempfile.mkdtemp` and is not removed
    by this function.

    Args:
        datasource: Datasource name (e.g. ``"hgnc"``).
        version: Optional specific version to download.
        keys: Optional list of file-key names to download. When given, only
            those keys are fetched (e.g. ``["complete"]``).

    Returns:
        The result of ``download_datasource``; callers index it by file key.
    """
    from tempfile import mkdtemp

    from pysec2pri.download import download_datasource

    target_dir = Path(mkdtemp(prefix=f"pysec2pri_{datasource}_"))
    return download_datasource(datasource, target_dir, version=version, keys=keys)
[docs]
def generate_chebi(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
    subset: str = "3star",
    mapping_sets: str = "ids",
) -> Sec2PriMappingSet:
    """Build the ChEBI mapping set (ID mappings, synonym mappings, or both).

    Without ``input_path`` the release is downloaded into a new temporary
    directory; when ``version`` is also omitted, the release reported by
    ``check_chebi_release()`` is used (falling back to ``"245"``).
    Pass an SDF file (releases < 245) or a directory of TSV flat files
    (releases >= 245) to use a local copy.

    Args:
        input_path: Local SDF file or TSV directory. Auto-downloaded if ``None``.
        version: Release number (e.g. ``"245"``).
        show_progress: Whether to show progress bars.
        subset: ``"3star"`` (default) or ``"complete"``.
        mapping_sets: ``"ids"`` (default), ``"synonyms"``, or ``"all"``. Any
            other value is treated like ``"ids"``.

    Returns:
        The parsed mapping set; for ``"all"`` the ID set extended with the
        synonym mappings via :func:`combine_mapping_sets`.
    """
    import tempfile

    from pysec2pri.download import check_chebi_release
    from pysec2pri.parsers import ChEBIParser
    from pysec2pri.parsers.chebi import ChEBIDownloader

    if input_path is None:
        # The release is resolved first: it names the temp dir and is also
        # handed to the parser further down.
        if version is None:
            version = check_chebi_release().version or "245"
        fetcher = ChEBIDownloader(version=version, subset=subset)
        download_dir = Path(tempfile.mkdtemp(prefix=f"pysec2pri_chebi_{version}_"))
        fetcher.download(download_dir, version=version)
        input_path = download_dir

    chebi = ChEBIParser(version=version, show_progress=show_progress, subset=subset)
    source = Path(input_path)

    if mapping_sets == "all":
        id_set = chebi.parse(source)
        synonym_set = chebi.parse_synonyms(source)
        return combine_mapping_sets(id_set, synonym_set)
    if mapping_sets == "synonyms":
        return chebi.parse_synonyms(source)
    # "ids" and every unrecognised value end up here
    return chebi.parse(source)
[docs]
def generate_chebi_synonyms(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
    subset: str = "3star",
) -> Sec2PriMappingSet:
    """Return ChEBI synonym (name) mappings.

    Shorthand for :func:`generate_chebi` with ``mapping_sets="synonyms"``;
    all other arguments are passed through unchanged.
    """
    # Positional order matches generate_chebi's signature:
    # (input_path, version, show_progress, subset, mapping_sets)
    return generate_chebi(input_path, version, show_progress, subset, "synonyms")
[docs]
def generate_hgnc(
    input_path: Path | str | None = None,
    complete_set_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Return HGNC secondary to primary ID mappings.

    Whichever of the two input files is not supplied is downloaded
    automatically; a file that is supplied is not downloaded again. The
    complete set is passed to the parser so that
    :meth:`~pysec2pri.parsers.base.Sec2PriMappingSet.to_pri_ids` returns the
    authoritative list of current primary IDs rather than just the primaries
    that happen to have a secondary.

    Args:
        input_path: Local HGNC withdrawn TSV. Auto-downloaded if ``None``.
        complete_set_path: Local HGNC complete set TSV. Auto-downloaded if
            ``None``.
        version: Version string for metadata.
        show_progress: Whether to show progress bars.

    Returns:
        The mapping set produced by ``HGNCParser.parse``.
    """
    from pysec2pri.parsers import HGNCParser

    # Ask the downloader only for the files the caller did not provide.
    missing = [
        key
        for key, supplied in (("withdrawn", input_path), ("complete", complete_set_path))
        if supplied is None
    ]
    if missing:
        files = _auto_download("hgnc", version, keys=missing)
        if input_path is None:
            input_path = files["withdrawn"]
        if complete_set_path is None:
            complete_set_path = files["complete"]

    parser = HGNCParser(version=version, show_progress=show_progress)
    return parser.parse(Path(input_path), complete_set_path=Path(complete_set_path))
[docs]
def generate_hgnc_primary_ids(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Return a mapping set holding the full list of current HGNC primary IDs.

    Only the HGNC complete set file is downloaded/read. The returned mapping
    set has an empty ``mappings`` list; its ``_primary_ids`` store is
    populated with every current HGNC ID so that ``to_pri_ids()`` yields the
    complete list, not just the primaries that have an associated secondary.

    Args:
        input_path: Local HGNC complete set TSV. Auto-downloaded if ``None``.
        version: Version string for metadata.
        show_progress: Whether to show progress bars.
    """
    from pysec2pri.parsers import HGNCParser

    complete_set = input_path
    if complete_set is None:
        downloaded = _auto_download("hgnc", version, keys=["complete"])
        complete_set = downloaded["complete"]

    hgnc = HGNCParser(version=version, show_progress=show_progress)
    return hgnc.parse_primary_ids(Path(complete_set))
def generate_hgnc_primary_symbols(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Return a mapping set holding the full list of current HGNC primary symbols.

    Only the HGNC complete set file is downloaded/read. The returned mapping
    set has an empty ``mappings`` list; its ``_primary_symbols`` store is
    populated with every current HGNC symbol so that ``to_pri_symbols()``
    yields the complete list, not just the primaries that have an associated
    secondary.

    Args:
        input_path: Local HGNC complete set TSV. Auto-downloaded if ``None``.
        version: Version string for metadata.
        show_progress: Whether to show progress bars.
    """
    from pysec2pri.parsers import HGNCParser

    if input_path is not None:
        complete_set = input_path
    else:
        complete_set = _auto_download("hgnc", version, keys=["complete"])["complete"]

    symbol_parser = HGNCParser(version=version, show_progress=show_progress)
    return symbol_parser.parse_primary_symbols(Path(complete_set))
[docs]
def generate_hgnc_symbols(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
    statuses: list[str] | None = None,
) -> Sec2PriMappingSet:
    """Return HGNC symbol to previous-symbol mappings.

    Downloads the complete set file (and only that file) automatically when
    ``input_path`` is omitted.

    Args:
        input_path: Local HGNC complete set TSV. Auto-downloaded if ``None``.
        version: Version string for metadata.
        show_progress: Whether to show progress bars.
        statuses: Entry statuses to include (e.g. ``["Approved"]``).

    Returns:
        The mapping set produced by ``HGNCParser.parse_symbols``.
    """
    from pysec2pri.parsers import HGNCParser

    if input_path is None:
        # Only the complete set is read here, so do not fetch the other files.
        input_path = _auto_download("hgnc", version, keys=["complete"])["complete"]

    parser = HGNCParser(version=version, show_progress=show_progress)
    return parser.parse_symbols(Path(input_path), statuses=statuses)
[docs]
def generate_chebi_primary_ids(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
    subset: str = "3star",
) -> Sec2PriMappingSet:
    """Return a mapping set holding the full list of current ChEBI primary IDs.

    Reads ``compounds.tsv`` to extract every current ChEBI compound ID.
    The returned mapping set has an empty ``mappings`` list; ``_primary_ids``
    is populated with every current ``CHEBI:<n>`` CURIE.

    Args:
        input_path: Local ``compounds.tsv`` file or directory containing it.
            When ``None``, only the ``"compounds"`` file key is downloaded
            into a new temporary directory.
        version: Release number (e.g. ``"245"``). When both this and
            ``input_path`` are ``None``, ``check_chebi_release()`` decides
            (falling back to ``"245"``).
        show_progress: Whether to show progress bars.
        subset: ``"3star"`` (default) or ``"complete"``.
    """
    import tempfile

    from pysec2pri.download import check_chebi_release
    from pysec2pri.parsers.chebi import ChEBIDownloader, ChEBIParser

    if input_path is None:
        if version is None:
            version = check_chebi_release().version or "245"
        fetcher = ChEBIDownloader(version=version, subset=subset)
        scratch_dir = Path(tempfile.mkdtemp(prefix=f"pysec2pri_chebi_{version}_"))
        fetcher.download(scratch_dir, version=version, keys=["compounds"])
        input_path = scratch_dir

    chebi = ChEBIParser(version=version, show_progress=show_progress, subset=subset)
    source = Path(input_path)
    return chebi.parse_primary_ids(source)
[docs]
def generate_chebi_primary_symbols(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
    subset: str = "3star",
) -> Sec2PriMappingSet:
    """Return a mapping set holding the full list of current ChEBI compound names.

    Reads ``compounds.tsv`` to extract every current compound's canonical
    name. The returned mapping set has an empty ``mappings`` list;
    ``_primary_symbols`` is populated.

    Args:
        input_path: Local ``compounds.tsv`` file or directory containing it.
            When ``None``, only the ``"compounds"`` file key is downloaded
            into a new temporary directory.
        version: Release number (e.g. ``"245"``). When both this and
            ``input_path`` are ``None``, ``check_chebi_release()`` decides
            (falling back to ``"245"``).
        show_progress: Whether to show progress bars.
        subset: ``"3star"`` (default) or ``"complete"``.
    """
    import tempfile

    from pysec2pri.download import check_chebi_release
    from pysec2pri.parsers.chebi import ChEBIDownloader, ChEBIParser

    needs_download = input_path is None
    if needs_download:
        if version is None:
            version = check_chebi_release().version or "245"
        release_downloader = ChEBIDownloader(version=version, subset=subset)
        work_dir = Path(tempfile.mkdtemp(prefix=f"pysec2pri_chebi_{version}_"))
        release_downloader.download(work_dir, version=version, keys=["compounds"])
        input_path = work_dir

    name_parser = ChEBIParser(version=version, show_progress=show_progress, subset=subset)
    return name_parser.parse_primary_symbols(Path(input_path))
[docs]
def generate_ncbi_primary_ids(
    input_path: Path | str | None = None,
    tax_id: str = "9606",
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Return a mapping set holding the full list of current NCBI Gene primary IDs.

    Reads ``gene_info`` to extract every current Gene ID for the given
    taxonomy. The returned mapping set has an empty ``mappings`` list;
    ``_primary_ids`` is populated with every current ``NCBIGene:<id>`` CURIE.

    Args:
        input_path: Local gene_info file. Auto-downloaded if ``None``.
        tax_id: Taxonomy ID to filter by (default: ``"9606"`` for human).
        version: Version string for metadata.
        show_progress: Whether to show progress bars.
    """
    from pysec2pri.parsers import NCBIParser

    gene_info = input_path
    if gene_info is None:
        downloaded = _auto_download("ncbi", version, keys=["gene_info"])
        gene_info = downloaded["gene_info"]

    ncbi = NCBIParser(version=version, show_progress=show_progress)
    return ncbi.parse_primary_ids(Path(gene_info), tax_id=tax_id)
[docs]
def generate_ncbi_primary_symbols(
    input_path: Path | str | None = None,
    tax_id: str = "9606",
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Return a mapping set holding the full list of current NCBI Gene symbols.

    Reads ``gene_info`` to extract every current gene symbol for the given
    taxonomy. The returned mapping set has an empty ``mappings`` list;
    ``_primary_symbols`` is populated.

    Args:
        input_path: Local gene_info file. Auto-downloaded if ``None``.
        tax_id: Taxonomy ID to filter by (default: ``"9606"`` for human).
        version: Version string for metadata.
        show_progress: Whether to show progress bars.
    """
    from pysec2pri.parsers import NCBIParser

    if input_path is not None:
        gene_info = input_path
    else:
        gene_info = _auto_download("ncbi", version, keys=["gene_info"])["gene_info"]

    symbol_parser = NCBIParser(version=version, show_progress=show_progress)
    return symbol_parser.parse_primary_symbols(Path(gene_info), tax_id=tax_id)
[docs]
def generate_hmdb_primary_ids(
    metabolites_path: Path | str | None = None,
    proteins_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Return a mapping set holding the full list of current HMDB primary IDs.

    Reads one or both of ``hmdb_metabolites.xml`` and ``hmdb_proteins.xml``
    and collects all primary accession numbers. The returned mapping set has
    an empty ``mappings`` list; ``_primary_ids`` is populated with every
    current ``HMDB:<acc>`` CURIE.

    Args:
        metabolites_path: Local metabolites XML file. Auto-downloaded only
            when both paths are ``None``.
        proteins_path: Local proteins XML file (optional; never downloaded
            by this function).
        version: Version string for metadata.
        show_progress: Whether to show progress bars.
    """
    from pysec2pri.parsers.hmdb import HMDBParser

    nothing_supplied = metabolites_path is None and proteins_path is None
    if nothing_supplied:
        downloaded = _auto_download("hmdb", version, keys=["metabolites"])
        metabolites_path = downloaded["metabolites"]

    hmdb = HMDBParser(version=version, show_progress=show_progress)
    # Both paths are handed over as given (no Path() conversion here).
    return hmdb.parse_primary_ids(metabolites_path=metabolites_path, proteins_path=proteins_path)
[docs]
def generate_uniprot_primary_ids(
    acindex_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Return a mapping set holding the full list of current UniProt primary ACs.

    Parses ``acindex.txt`` to extract every accession number that currently
    appears in UniProtKB/Swiss-Prot. The returned mapping set has an empty
    ``mappings`` list; ``_primary_ids`` is populated with every current
    ``UniProtKB:<AC>`` CURIE.

    For versioned (legacy) releases the file is available at::

        https://ftp.uniprot.org/pub/databases/uniprot/previous_releases/
        release-{version}/knowledgebase/docs/acindex.txt.gz

    Args:
        acindex_path: Local ``acindex.txt`` (plain or ``.gz``).
            Auto-downloaded when ``None``.
        version: Version string for metadata.
        show_progress: Whether to show progress bars.
    """
    from pysec2pri.parsers.uniprot import UniProtParser

    index_file = acindex_path
    if index_file is None:
        downloaded = _auto_download("uniprot", version, keys=["acindex"])
        index_file = downloaded["acindex"]

    uniprot = UniProtParser(version=version, show_progress=show_progress)
    return uniprot.parse_primary_ids(Path(index_file))
[docs]
def generate_ncbi(
    input_path: Path | str | None = None,
    gene_info_path: Path | str | None = None,
    tax_id: str = "9606",
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Return NCBI Gene secondary to primary ID mappings.

    Whichever of ``gene_history`` / ``gene_info`` is not supplied is
    downloaded automatically; a file that is supplied is not downloaded
    again. ``gene_info`` is passed to the parser so that
    :meth:`~pysec2pri.parsers.base.Sec2PriMappingSet.to_pri_ids` returns the
    authoritative complete set of primary IDs rather than only the primaries
    that happen to appear in ``gene_history``.

    Args:
        input_path: Local gene_history file. Auto-downloaded if ``None``.
        gene_info_path: Local gene_info file used to populate the full primary
            ID list. Auto-downloaded if ``None``.
        tax_id: NCBI taxonomy ID to filter (default: ``"9606"`` for human).
        version: Version string for metadata.
        show_progress: Whether to show progress bars.

    Returns:
        The mapping set produced by ``NCBIParser.parse``.
    """
    from pysec2pri.parsers import NCBIParser

    # Ask the downloader only for the files the caller did not provide.
    missing = [
        key
        for key, supplied in (("gene_history", input_path), ("gene_info", gene_info_path))
        if supplied is None
    ]
    if missing:
        files = _auto_download("ncbi", version, keys=missing)
        if input_path is None:
            input_path = files["gene_history"]
        if gene_info_path is None:
            gene_info_path = files["gene_info"]

    parser = NCBIParser(version=version, show_progress=show_progress)
    return parser.parse(Path(input_path), tax_id=tax_id, gene_info_path=Path(gene_info_path))
[docs]
def generate_ncbi_symbols(
    input_path: Path | str | None = None,
    tax_id: str = "9606",
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Return NCBI Gene symbol to previous-symbol mappings.

    Downloads the gene_info file (and only that file) automatically when
    ``input_path`` is omitted.

    Args:
        input_path: Local gene_info file. Auto-downloaded if ``None``.
        tax_id: NCBI taxonomy ID to filter (default: ``"9606"`` for human).
        version: Version string for metadata.
        show_progress: Whether to show progress bars.

    Returns:
        The mapping set produced by ``NCBIParser.parse_symbols``.
    """
    from pysec2pri.parsers import NCBIParser

    if input_path is None:
        # gene_history is not read here, so do not fetch it.
        input_path = _auto_download("ncbi", version, keys=["gene_info"])["gene_info"]

    parser = NCBIParser(version=version, show_progress=show_progress)
    return parser.parse_symbols(Path(input_path), tax_id=tax_id)
[docs]
def generate_uniprot(
    input_path: Path | str | None = None,
    delac_file: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Return UniProt secondary to primary accession mappings.

    When ``input_path`` is omitted the UniProt files are downloaded; the
    ``"sec_ac"`` entry is used as input (or, failing that, the first
    downloaded file) and ``delac_file`` defaults to the ``"delac_sp"`` entry
    if there is one. Nothing is downloaded when ``input_path`` is given.

    Args:
        input_path: Local sec_ac.txt. Auto-downloaded if ``None``.
        delac_file: Local delac_sp.txt.
        version: Version string for metadata.
        show_progress: Whether to show progress bars.
    """
    from pysec2pri.parsers import UniProtParser

    if input_path is None:
        downloaded = _auto_download("uniprot", version)
        sec_ac = downloaded.get("sec_ac")
        input_path = sec_ac or next(iter(downloaded.values()))
        if delac_file is None:
            delac_file = downloaded.get("delac_sp")

    # A missing/empty delac file is forwarded as None
    delac_path = Path(delac_file) if delac_file else None
    uniprot = UniProtParser(version=version, show_progress=show_progress)
    return uniprot.parse(Path(input_path), delac_path=delac_path)
[docs]
def generate_hmdb(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Return HMDB metabolite secondary to primary accession mappings.

    Downloads hmdb_metabolites.xml (and only that file) automatically when
    ``input_path`` is omitted.

    Args:
        input_path: Local hmdb_metabolites.xml (or .zip/.gz). Auto-downloaded if ``None``.
        version: Version string for metadata.
        show_progress: Whether to show progress bars.

    Returns:
        The mapping set produced by ``HMDBParser.parse``.
    """
    from pysec2pri.parsers import HMDBParser

    if input_path is None:
        # The proteins file is not read here, so do not fetch it.
        input_path = _auto_download("hmdb", version, keys=["metabolites"])["metabolites"]

    parser = HMDBParser(version=version, show_progress=show_progress)
    return parser.parse(Path(input_path))
[docs]
def generate_hmdb_proteins(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Return HMDB protein secondary to primary accession mappings.

    Downloads hmdb_proteins.xml (and only that file) automatically when
    ``input_path`` is omitted.

    Args:
        input_path: Local hmdb_proteins.xml (or .zip/.gz). Auto-downloaded if ``None``.
        version: Version string for metadata.
        show_progress: Whether to show progress bars.

    Returns:
        The mapping set produced by ``HMDBParser.parse_proteins``.
    """
    from pysec2pri.parsers import HMDBParser

    if input_path is None:
        # The metabolites file is not read here, so do not fetch it.
        input_path = _auto_download("hmdb", version, keys=["proteins"])["proteins"]

    parser = HMDBParser(version=version, show_progress=show_progress)
    return parser.parse_proteins(Path(input_path))
[docs]
def generate_wikidata(
    input_path: Path | str | None = None,
    entity_type: str | None = None,
    version: str | None = None,
    endpoint: str | None = None,
    show_progress: bool = True,
    test_subset: bool = False,
) -> Sec2PriMappingSet:
    """Return Wikidata redirect mappings via SPARQL (or a pre-downloaded TSV).

    Queries the QLever Wikidata endpoint when ``input_path`` is omitted.
    If ``entity_type`` is ``None``, all entity types (metabolites, genes,
    proteins) are queried and combined. When ``input_path`` is given, the
    file is parsed instead and no query is issued; the parser is then
    configured with ``"metabolites"`` if no ``entity_type`` was passed.

    Args:
        input_path: Pre-downloaded TSV file. Queries SPARQL if ``None``.
        entity_type: ``"metabolites"``, ``"chemicals"``, ``"genes"``, or
            ``"proteins"``. Queries all types when ``None``.
        version: Version string for metadata (defaults to today's date).
        endpoint: Custom SPARQL endpoint URL.
        show_progress: Whether to show progress bars.
        test_subset: Use test queries limited to 10 results.
    """
    from pysec2pri.parsers import WikidataParser

    wikidata = WikidataParser(
        version=version,
        show_progress=show_progress,
        entity_type=entity_type or "metabolites",
        endpoint=endpoint,
        test_subset=test_subset,
    )

    if input_path is None:
        # Live query: everything when no type was requested, else just that type
        return wikidata.parse_all() if entity_type is None else wikidata.parse()
    return wikidata.parse_from_file(Path(input_path))
[docs]
def generate_wikidata_symbols(
    input_path: Path | str | None = None,
    entity_type: str | None = None,
    version: str | None = None,
    endpoint: str | None = None,
    show_progress: bool = True,
    test_subset: bool = False,
) -> LabelMappingSet:
    """Return Wikidata label mappings (previous label to current label).

    Queries the QLever Wikidata endpoint when ``input_path`` is omitted.
    If ``entity_type`` is ``None``, one parser per entity type
    (metabolites, genes, proteins) is run and the label mappings are merged
    into the first result, whose cardinalities are then recomputed.

    Args:
        input_path: Pre-downloaded TSV file. Queries SPARQL if ``None``.
            The same path is handed to every per-type parser.
        entity_type: ``"metabolites"``, ``"chemicals"``, ``"genes"``, or
            ``"proteins"``. Queries all types when ``None``.
        version: Version string for metadata.
        endpoint: Custom SPARQL endpoint URL.
        show_progress: Whether to show progress bars.
        test_subset: Use test queries limited to 10 results.

    Returns:
        :class:`~pysec2pri.parsers.base.LabelMappingSet` with label mappings.
    """
    from pysec2pri.parsers import WikidataParser

    if entity_type is None:
        requested_types = ["metabolites", "genes", "proteins"]
    else:
        requested_types = [entity_type]

    label_sets = []
    for kind in requested_types:
        wikidata = WikidataParser(
            version=version,
            show_progress=show_progress,
            entity_type=kind,
            endpoint=endpoint,
            test_subset=test_subset,
        )
        source = Path(input_path) if input_path else None
        label_sets.append(wikidata.parse_symbols(source))

    result = label_sets[0]
    if len(label_sets) == 1:
        return result

    # Several entity types: fold every set's mappings into the first set
    merged = []
    for label_set in label_sets:
        merged.extend(label_set.mappings or [])
    result.mappings = merged
    result.compute_cardinalities()
    return result
[docs]
def combine_mapping_sets(
    id_mappings: Sec2PriMappingSet | None,
    synonym_mappings: Sec2PriMappingSet | None,
) -> Sec2PriMappingSet:
    """Merge two mapping sets into one.

    When both sets are given, ``id_mappings`` is updated in place: its
    ``mappings`` attribute is replaced by a new list holding its own
    mappings followed by those of ``synonym_mappings``, and ``id_mappings``
    itself is returned. ``synonym_mappings`` is left untouched. When only
    one set is given, that set is returned as is.

    Args:
        id_mappings: First mapping set (e.g. ID mappings).
        synonym_mappings: Second mapping set (e.g. synonym mappings).

    Returns:
        Combined mapping set.

    Raises:
        ValueError: If both mapping sets are ``None``.
    """
    if id_mappings is None:
        if synonym_mappings is None:
            msg = "At least one mapping set must be provided"
            raise ValueError(msg)
        return synonym_mappings
    if synonym_mappings is None:
        return id_mappings

    # A missing (None) mappings list counts as empty
    id_mappings.mappings = [
        *(id_mappings.mappings or []),
        *(synonym_mappings.mappings or []),
    ]
    return id_mappings
# Output helpers

# File extension per output format; formats not listed here get ".tsv".
_FORMAT_EXTENSIONS: dict[str, str] = {
    "rdf": ".ttl",
    "owl": "_owl.ttl",
    "json": ".json",
}


def _output_filename(base_name: str, fmt: str) -> Path:
    """Build the default output filename for *base_name* written as *fmt*.

    The result is ``<base_name>_<fmt><ext>``, except for ``"owl"`` whose
    extension already contains the ``_owl`` part.
    """
    extension = _FORMAT_EXTENSIONS.get(fmt, ".tsv")
    stem = base_name if fmt == "owl" else f"{base_name}_{fmt}"
    return Path(stem + extension)
[docs]
def save(
    mapping_set: Sec2PriMappingSet,
    output_format: str,
    output: Path | str | None = None,
    *,
    base_name: str,
) -> Path:
    """Write *mapping_set* to disk and return the location that was written.

    Single formats are delegated to
    :meth:`~pysec2pri.parsers.base.Sec2PriMappingSet.save`; ``"all"`` is
    delegated to :func:`write_all_formats`.

    Args:
        mapping_set: The mapping set to write.
        output_format: One of ``sssom``, ``sec2pri``, ``pri_ids``,
            ``name2synonym``, ``symbol_sec2pri``, ``pri_symbols``,
            ``rdf``, ``json``, ``owl``, or ``all``.
        output: Explicit output path or directory. When ``None`` (or
            empty), a default name derived from *base_name* is used. For a
            single format, an existing directory receives the default file
            name; for ``"all"``, a path with a suffix is treated as a file
            and the output directory becomes ``<its parent>/<base_name>``.
        base_name: Stem used to derive file names, e.g. ``"hgnc_2026-04-07"``.

    Returns:
        The directory (for ``"all"``) or the value returned by
        ``mapping_set.save`` for a single format.
    """
    # NOTE(review): write_all_formats is neither imported nor defined in the
    # part of this module visible during review — confirm it is in scope.
    target = Path(output) if output else None

    if output_format != "all":
        # Single format: settle on a file path, then let the mapping set write it
        if target is None:
            file_path = _output_filename(base_name, output_format)
        elif target.is_dir():
            file_path = target / _output_filename(base_name, output_format).name
        else:
            file_path = target
        return mapping_set.save(output_format, file_path)

    if target is None:
        directory = Path(base_name)
    elif target.suffix:
        directory = target.parent / base_name
    else:
        directory = target
    write_all_formats(mapping_set, directory, base_name)
    return directory
[docs]
def write_diff_output(
    result: MappingDiff,
    output_path: Path,
) -> None:
    """Write diff results to a TSV file.

    The output has the columns ``change_type``, ``object_id``,
    ``new_subject_id`` and ``old_subject_id``. Added rows have no
    ``old_subject_id``, removed rows have no ``new_subject_id``. When the
    diff contains no added, removed or changed rows, no file is written.

    Args:
        result: MappingDiff object with added/removed/changed mappings. Its
            ``added`` / ``removed`` frames are read through their
            ``subject_id`` and ``object_id`` columns, ``changed`` through
            ``object_id``, ``new_subject_id`` and ``old_subject_id``.
        output_path: Path to write the TSV file.
    """
    import polars as pl

    columns = ["change_type", "object_id", "new_subject_id", "old_subject_id"]
    frames = []

    if result.added_count > 0:
        frames.append(
            result.added.with_columns(
                pl.lit("added").alias("change_type"),
                pl.col("subject_id").alias("new_subject_id"),
                pl.lit(None).alias("old_subject_id"),
            ).select(columns)
        )
    if result.removed_count > 0:
        frames.append(
            result.removed.with_columns(
                pl.lit("removed").alias("change_type"),
                pl.lit(None).alias("new_subject_id"),
                pl.col("subject_id").alias("old_subject_id"),
            ).select(columns)
        )
    if result.changed_count > 0:
        frames.append(
            result.changed.with_columns(
                pl.lit("changed").alias("change_type"),
            ).select(columns)
        )

    if not frames:
        return

    # ``pl.lit(None)`` produces Null-typed columns, while the same column in
    # the other frames holds real values. The relaxed mode coerces them to a
    # common supertype instead of failing on the schema mismatch.
    combined = pl.concat(frames, how="vertical_relaxed")
    combined.write_csv(output_path, separator="\t")
[docs]
def load_mapping(path: Path | str) -> IdMappingSet:
    """Load an ID mapping set from a pysec2pri TSV file.

    Accepts the ``sec2pri`` TSV format (columns ``subject_id``, ``object_id``,
    ``predicate_id``, ``mapping_cardinality``) and the full SSSOM TSV format
    (comment-prefixed metadata lines are skipped automatically).

    Cells are always read as plain text: no value (such as ``NA``) is
    interpreted as a missing-value marker. Absent columns and blank cells
    become ``""`` for the ID/predicate fields, the default justification
    for ``mapping_justification`` and ``None`` for ``mapping_cardinality``.

    Args:
        path: Path to the TSV file to load.

    Returns:
        An :class:`~pysec2pri.parsers.base.IdMappingSet` populated from the
        file, ready to pass to :func:`resolve_ids`.
    """
    import pandas as pd
    from sssom_schema import Mapping

    def _text(record: dict[str, Any], column: str) -> str | None:
        """Return the cell as a non-empty string, or ``None`` if absent/blank."""
        value = record.get(column)
        # Guards against float NaN as well: NaN is truthy, so a plain
        # ``value or default`` would let it through.
        return value if isinstance(value, str) and value else None

    path = Path(path)
    # NOTE(review): pandas' ``comment="#"`` also truncates data lines at any
    # later "#" character — confirm no identifier can contain one.
    df = pd.read_csv(path, sep="\t", dtype=str, comment="#", keep_default_na=False)
    ms = IdMappingSet(
        mapping_set_id=str(path),
        license="https://creativecommons.org/licenses/by/4.0/",
    )
    ms.mappings = [
        Mapping(
            subject_id=_text(record, "subject_id") or "",
            object_id=_text(record, "object_id") or "",
            predicate_id=_text(record, "predicate_id") or "",
            mapping_justification=_text(record, "mapping_justification")
            or "semapv:BackgroundKnowledgeBasedMatching",
            mapping_cardinality=_text(record, "mapping_cardinality"),
        )
        for record in df.to_dict(orient="records")
    ]
    return ms
[docs]
def load_label_mapping(path: Path | str) -> LabelMappingSet:
    """Load a label/symbol mapping set from a pysec2pri TSV file.

    Accepts the ``symbol2prev`` TSV format (columns ``subject_id``,
    ``subject_label``, ``object_label``, ``mapping_cardinality``) and the
    full SSSOM TSV format (comment-prefixed metadata lines are skipped).

    Cells are always read as plain text: no label (such as ``NA``) is
    interpreted as a missing-value marker. Absent columns and blank cells
    become ``""`` for ``subject_id`` / ``predicate_id``, the default
    justification for ``mapping_justification`` and ``None`` for the label
    and cardinality fields.

    Args:
        path: Path to the TSV file to load.

    Returns:
        A :class:`~pysec2pri.parsers.base.LabelMappingSet` populated from
        the file, ready to pass to :func:`resolve_symbols`.
    """
    import pandas as pd
    from sssom_schema import Mapping

    def _text(record: dict[str, Any], column: str) -> str | None:
        """Return the cell as a non-empty string, or ``None`` if absent/blank."""
        value = record.get(column)
        # Guards against float NaN as well: NaN is truthy, so a plain
        # ``value or default`` would let it through.
        return value if isinstance(value, str) and value else None

    path = Path(path)
    # NOTE(review): pandas' ``comment="#"`` also truncates data lines at any
    # later "#" character — confirm no label can contain one.
    df = pd.read_csv(path, sep="\t", dtype=str, comment="#", keep_default_na=False)
    ms = LabelMappingSet(
        mapping_set_id=str(path),
        license="https://creativecommons.org/licenses/by/4.0/",
    )
    ms.mappings = [
        Mapping(
            subject_id=_text(record, "subject_id") or "",
            subject_label=_text(record, "subject_label"),
            object_label=_text(record, "object_label"),
            predicate_id=_text(record, "predicate_id") or "",
            mapping_justification=_text(record, "mapping_justification")
            or "semapv:BackgroundKnowledgeBasedMatching",
            mapping_cardinality=_text(record, "mapping_cardinality"),
        )
        for record in df.to_dict(orient="records")
    ]
    return ms
[docs]
def resolve_ids(
    input_path: Path | str | list[str],
    mapping_set: Sec2PriMappingSet,
    at: str | list[str] | None = None,
    *,
    output_path: Path | str | None = None,
    suffix: str = "_primary",
    sep: str | None = None,
    synonyms: str | None = None,
    label_mapping_set: Sec2PriMappingSet | None = None,
) -> pd.DataFrame | str | list[str]:
    r"""Resolve secondary IDs to primary IDs.

    Direct lookup: when *input_path* is a plain identifier string or a list
    of identifier strings (i.e. not a path to an existing file), the function
    returns the resolved primary ID(s). All other arguments except
    *mapping_set* are ignored in this mode::

        resolve_ids("HMDB00001", hmdb_ms)                  # -> "HMDB:HMDB0000001"
        resolve_ids(["HMDB00001", "HMDB00002"], hmdb_ms)   # -> ["...", "..."]

    A string that names an existing *directory* (this includes the empty
    string, which resolves to the current directory) is treated as an
    identifier, not as a file.

    DataFrame mode: when *input_path* points to an existing TSV/CSV
    file, *at* is required. The file is read with
    ``pandas.read_csv`` and for each column named in *at* a new column
    ``<col><suffix>`` is appended containing the resolved primary IDs.
    Identifiers not present in *mapping_set* are kept unchanged.

    Args:
        input_path: An identifier string, a list of identifier strings, or
            the path to a TSV/CSV file.
        mapping_set: A :class:`~pysec2pri.parsers.base.Sec2PriMappingSet`
            (e.g. the result of ``generate_hgnc()``).
        at: Column name(s) to resolve. Required in DataFrame mode;
            ignored in direct-lookup mode.
        output_path: If given, the resulting DataFrame is written to this
            path (DataFrame mode only), tab-separated for ``.tsv`` and
            comma-separated otherwise.
        suffix: Suffix appended to each resolved column name
            (default ``"_primary"``).
        sep: Delimiter for reading the file. Inferred from the extension
            when ``None`` (``"\t"`` for ``.tsv``, ``","`` otherwise).
        synonyms: Forwarded unchanged to ``update_ids`` (DataFrame mode only).
        label_mapping_set: Forwarded unchanged to ``update_ids``
            (DataFrame mode only).

    Returns:
        A resolved identifier string, a list of resolved strings (direct-lookup
        mode), or a :class:`pandas.DataFrame` with one additional column per
        entry in *at* (DataFrame mode).

    Raises:
        TypeError: If *input_path* is an existing file and *at* is ``None``.
    """
    import pandas as pd

    from pysec2pri.update_ids import build_lookup, update_ids

    # list direct-lookup mode
    if isinstance(input_path, list):
        lookup = build_lookup(mapping_set)
        return [lookup.get(item, item) for item in input_path]

    # single-value lookup mode; directories can never be an input table, so
    # an identifier that merely coincides with one is still looked up
    source = Path(input_path)
    if not source.exists() or source.is_dir():
        identifier = str(input_path)
        return build_lookup(mapping_set).get(identifier, identifier)

    # DataFrame mode
    if at is None:
        raise TypeError("resolve_ids() requires 'at' when input_path is a file")
    if sep is None:
        sep = "\t" if source.suffix.lower() == ".tsv" else ","
    table = pd.read_csv(source, sep=sep, dtype=str)
    resolved = update_ids(
        table,
        mapping_set,
        at=at,
        suffix=suffix,
        lookup=build_lookup(mapping_set),
        synonyms=synonyms,
        label_mapping_set=label_mapping_set,
    )
    if output_path is not None:
        destination = Path(output_path)
        out_sep = "\t" if destination.suffix.lower() == ".tsv" else ","
        resolved.to_csv(destination, sep=out_sep, index=False)
    return resolved
[docs]
def resolve_symbols(
    input_path: Path | str | list[str],
    mapping_set: Sec2PriMappingSet,
    at: str | list[str] | None = None,
    *,
    output_path: Path | str | None = None,
    suffix: str = "_current",
    sep: str | None = None,
    synonyms: str | None = None,
) -> pd.DataFrame | str | list[str]:
    r"""Resolve previous/alias symbols to current symbols.

    Direct lookup: when *input_path* is a plain symbol string or a list
    of symbol strings (i.e. not a path to an existing file), the function
    returns the resolved current symbol(s). All other arguments except
    *mapping_set* are ignored in this mode::

        resolve_symbols("Ibuprofen", chebi_ms)               # -> "ibuprofen"
        resolve_symbols(["Ibuprofen", "Glucose"], chebi_ms)  # -> ["...", "..."]

    A string that names an existing *directory* (this includes the empty
    string, which resolves to the current directory) is treated as a
    symbol, not as a file.

    DataFrame mode: when *input_path* points to an existing TSV/CSV
    file, *at* is required. For each column named in *at* a new
    column ``<col><suffix>`` is appended containing the resolved current
    symbols. Symbols not present in *mapping_set* are kept unchanged.

    Args:
        input_path: A symbol string, a list of symbol strings, or the path
            to a TSV/CSV file.
        mapping_set: A :class:`~pysec2pri.parsers.base.LabelMappingSet`
            (e.g. the result of ``generate_hgnc_symbols()``).
        at: Column name(s) to resolve. Required in DataFrame mode;
            ignored in direct-lookup mode.
        output_path: If given, the resulting DataFrame is written to this
            path (DataFrame mode only), tab-separated for ``.tsv`` and
            comma-separated otherwise.
        suffix: Suffix appended to each resolved column name
            (default ``"_current"``).
        sep: Delimiter for reading the file. Inferred from the extension
            when ``None`` (``"\t"`` for ``.tsv``, ``","`` otherwise).
        synonyms: Forwarded unchanged to ``update_symbols``
            (DataFrame mode only).

    Returns:
        A resolved symbol string, a list of resolved strings (direct-lookup
        mode), or a :class:`pandas.DataFrame` with one additional column per
        entry in *at* (DataFrame mode).

    Raises:
        TypeError: If *input_path* is an existing file and *at* is ``None``.
    """
    import pandas as pd

    from pysec2pri.update_ids import build_symbol_lookup, update_symbols

    # list direct-lookup mode
    if isinstance(input_path, list):
        lookup = build_symbol_lookup(mapping_set)
        return [lookup.get(item, item) for item in input_path]

    # single-value direct-lookup mode; directories can never be an input
    # table, so a symbol that merely coincides with one is still looked up
    source = Path(input_path)
    if not source.exists() or source.is_dir():
        symbol = str(input_path)
        return build_symbol_lookup(mapping_set).get(symbol, symbol)

    # DataFrame mode
    if at is None:
        raise TypeError("resolve_symbols() requires 'at' when input_path is a file")
    if sep is None:
        sep = "\t" if source.suffix.lower() == ".tsv" else ","
    table = pd.read_csv(source, sep=sep, dtype=str)
    resolved = pd.DataFrame(
        update_symbols(
            table,
            mapping_set,
            at=at,
            suffix=suffix,
            lookup=build_symbol_lookup(mapping_set),
            synonyms=synonyms,
        )
    )
    if output_path is not None:
        destination = Path(output_path)
        out_sep = "\t" if destination.suffix.lower() == ".tsv" else ","
        resolved.to_csv(destination, sep=out_sep, index=False)
    return resolved
[docs]
def list_versions(datasource: str) -> Any:
    """List all available archive versions for a datasource.

    Thin wrapper around ``pysec2pri.download.list_versions``. For
    datasources that publish versioned archives (ChEBI, HGNC, UniProt),
    the remote archive index is queried and all available version strings
    are returned in ascending order.

    NCBI and HMDB do not maintain versioned archives; calling this function
    for those datasources raises :class:`ValueError`.

    Args:
        datasource: Datasource name, one of ``"chebi"``, ``"hgnc"``, or
            ``"uniprot"``.

    Returns:
        Sorted list of version strings. Format depends on the datasource:

        - **chebi**: integer release numbers, e.g. ``["200", ..., "245"]``
        - **hgnc**: ISO dates, e.g. ``["2023-01-01", ..., "2026-04-07"]``
        - **uniprot**: release IDs, e.g. ``["2024_01", "2024_02", ...]``

    Raises:
        ValueError: If *datasource* is unknown or has no versioned archive.
    """
    # Imported under an alias so it does not clash with this function's name
    from pysec2pri.download import list_versions as fetch_versions

    available = fetch_versions(datasource)
    return available
[docs]
def find_ambiguous(
    mapping_set: Sec2PriMappingSet,
) -> AmbiguousMappingSet:
    """Find identifiers that are ambiguous in *mapping_set*.

    An identifier is ambiguous when it appears both as a ``subject_id`` (i.e.
    a secondary/previous term) and as a current primary identifier. Such
    entries cannot be automatically resolved without risk of corrupting
    references that are already current.

    This function only forwards to
    :meth:`~pysec2pri.parsers.base.Sec2PriMappingSet.find_ambiguous`.

    Args:
        mapping_set: A :class:`~pysec2pri.parsers.base.Sec2PriMappingSet`
            (e.g. the result of ``generate_hgnc()``).

    Returns:
        An :class:`~pysec2pri.parsers.base.AmbiguousMappingSet` whose
        ``mappings`` list contains one entry for each conflicting subject,
        with a ``comment`` explaining the conflict.
    """
    ambiguous = mapping_set.find_ambiguous()
    return ambiguous