"""Main functions for pysec2pri.
This module provides functions for parsing secondary-to-primary mapping files
from biological databases, and for generating and using standardized mapping sets.
"""
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
from pysec2pri.exports import (
write_json,
write_name2synonym,
write_output,
write_owl,
write_pri_ids,
write_rdf,
write_sec2pri,
write_sssom,
)
from pysec2pri.exports import (
write_symbol2prev as write_symbol_sec2pri,
)
if TYPE_CHECKING:
import pandas as pd
from pysec2pri.diff import MappingDiff
from pysec2pri.parsers.base import Sec2PriMappingSet
from pysec2pri.parsers.base import IdMappingSet, LabelMappingSet
# Names exported by ``from <this module> import *``: the ``generate_*``
# builders, the ``load_*`` / ``resolve_*`` helpers, ``save`` and the diff
# writer, plus the ``write_*`` exporters imported above from
# ``pysec2pri.exports``. Entries are kept in alphabetical order.
__all__ = [
"combine_mapping_sets",
"generate_chebi",
"generate_chebi_synonyms",
"generate_hgnc",
"generate_hgnc_primary_ids",
"generate_hgnc_symbols",
"generate_hmdb",
"generate_hmdb_proteins",
"generate_ncbi",
"generate_ncbi_symbols",
"generate_uniprot",
"generate_wikidata",
"generate_wikidata_symbols",
"load_label_mapping",
"load_mapping",
"resolve_ids",
"resolve_symbols",
"save",
"write_all_formats",
"write_diff_output",
"write_json",
"write_name2synonym",
"write_output",
"write_owl",
"write_rdf",
"write_sec2pri",
"write_sssom",
"write_symbol_sec2pri",
]
def _auto_download(
    datasource: str,
    version: str | None = None,
    keys: list[str] | None = None,
) -> dict[str, Path]:
    """Fetch the files of *datasource* into a freshly created temporary directory.

    Args:
        datasource: Datasource name (e.g. ``"hgnc"``); also embedded in the
            temporary directory's prefix.
        version: Optional specific version, forwarded unchanged to the downloader.
        keys: Optional list of file-key names (e.g. ``["complete"]``),
            forwarded unchanged to the downloader.

    Returns:
        The value returned by ``download_datasource`` (annotated as a mapping
        from file key to path).
    """
    from tempfile import mkdtemp

    from pysec2pri.download import download_datasource

    # The directory is not removed here; the returned paths presumably point into it.
    target_dir = Path(mkdtemp(prefix=f"pysec2pri_{datasource}_"))
    return download_datasource(datasource, target_dir, version=version, keys=keys)
[docs]
def generate_chebi(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
    subset: str = "3star",
    mapping_sets: str = "ids",
) -> Sec2PriMappingSet:
    """Build the ChEBI mapping set (IDs, synonyms, or both combined).

    Without ``input_path`` the release is downloaded into a temporary
    directory first. A local copy can be given instead: an SDF file
    (releases < 245) or a directory of TSV flat files (releases >= 245).

    Args:
        input_path: Local SDF file or TSV directory. Downloaded when ``None``.
        version: Release number (e.g. ``"245"``). When both this and
            ``input_path`` are ``None``, the version reported by
            ``check_chebi_release()`` is used, falling back to ``"245"``.
        show_progress: Whether to show progress bars.
        subset: ``"3star"`` (default) or ``"complete"``.
        mapping_sets: ``"synonyms"`` returns only synonym mappings, ``"all"``
            returns ID and synonym mappings combined; any other value
            (including the default ``"ids"``) returns the ID mappings.

    Returns:
        The parsed mapping set.
    """
    from tempfile import mkdtemp

    from pysec2pri.download import check_chebi_release
    from pysec2pri.parsers import ChEBIParser
    from pysec2pri.parsers.chebi import ChEBIDownloader

    if input_path is None:
        if version is None:
            version = check_chebi_release().version or "245"
        fetcher = ChEBIDownloader(version=version, subset=subset)
        download_dir = Path(mkdtemp(prefix=f"pysec2pri_chebi_{version}_"))
        fetcher.download(download_dir, version=version)
        input_path = download_dir

    chebi = ChEBIParser(version=version, show_progress=show_progress, subset=subset)
    source = Path(input_path)

    if mapping_sets == "all":
        # IDs are parsed first, synonyms second, then merged into one set.
        return combine_mapping_sets(chebi.parse(source), chebi.parse_synonyms(source))
    if mapping_sets == "synonyms":
        return chebi.parse_synonyms(source)
    return chebi.parse(source)
[docs]
def generate_chebi_synonyms(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
    subset: str = "3star",
) -> Sec2PriMappingSet:
    """Return ChEBI synonym (name) mappings.

    Convenience wrapper around :func:`generate_chebi` with
    ``mapping_sets="synonyms"``; all other arguments are passed through.
    """
    return generate_chebi(input_path, version, show_progress, subset, mapping_sets="synonyms")
[docs]
def generate_hgnc(
    input_path: Path | str | None = None,
    complete_set_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Build the HGNC secondary-to-primary ID mapping set.

    Whichever of the withdrawn file and the complete-set file is not supplied
    is taken from an automatic download. The complete set is handed to the
    parser so that
    :meth:`~pysec2pri.parsers.base.Sec2PriMappingSet.to_pri_ids` can report
    every current primary ID rather than only the primaries that have a
    secondary.

    Args:
        input_path: Local HGNC withdrawn TSV. Downloaded when ``None``.
        complete_set_path: Local HGNC complete set TSV. Downloaded when
            ``None``.
        version: Version string for metadata (also passed to the download).
        show_progress: Whether to show progress bars.

    Returns:
        The parsed mapping set.
    """
    from pysec2pri.parsers import HGNCParser

    need_withdrawn = input_path is None
    need_complete = complete_set_path is None
    if need_withdrawn or need_complete:
        # A single download call serves both files; only missing ones are taken from it.
        downloaded = _auto_download("hgnc", version)
        if need_withdrawn:
            input_path = downloaded["withdrawn"]
        if need_complete:
            complete_set_path = downloaded["complete"]

    hgnc = HGNCParser(version=version, show_progress=show_progress)
    withdrawn_file = Path(input_path)
    complete_file = Path(complete_set_path)
    return hgnc.parse(withdrawn_file, complete_set_path=complete_file)
[docs]
def generate_hgnc_primary_ids(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Build a mapping set holding the full list of current HGNC primary IDs.

    Only the HGNC complete set file is read (and, if needed, downloaded).
    The file is handed to ``HGNCParser.parse_primary_ids``; the result is
    meant to make ``to_pri_ids()`` yield the complete list of current IDs
    instead of only those primaries that have an associated secondary.

    Args:
        input_path: Local HGNC complete set TSV. Downloaded when ``None``.
        version: Version string for metadata (also passed to the download).
        show_progress: Whether to show progress bars.

    Returns:
        The mapping set produced by the parser.
    """
    from pysec2pri.parsers import HGNCParser

    if input_path is None:
        # Restrict the download to the one file this function needs.
        downloaded = _auto_download("hgnc", version, keys=["complete"])
        input_path = downloaded["complete"]

    complete_file = Path(input_path)
    return HGNCParser(version=version, show_progress=show_progress).parse_primary_ids(
        complete_file
    )
[docs]
def generate_hgnc_symbols(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
    statuses: list[str] | None = None,
) -> Sec2PriMappingSet:
    """Return HGNC symbol to previous-symbol mappings.

    Downloads the complete set file automatically when ``input_path`` is
    omitted.

    Args:
        input_path: Local HGNC complete set TSV. Auto-downloaded if ``None``.
        version: Version string for metadata (also passed to the download).
        show_progress: Whether to show progress bars.
        statuses: Entry statuses to include (e.g. ``["Approved"]``);
            forwarded unchanged to the parser.

    Returns:
        The mapping set produced by ``HGNCParser.parse_symbols``.
    """
    from pysec2pri.parsers import HGNCParser

    if input_path is None:
        # Only the complete set is read here, so request just that file key
        # (same call as generate_hgnc_primary_ids) instead of every HGNC file.
        input_path = _auto_download("hgnc", version, keys=["complete"])["complete"]

    parser = HGNCParser(version=version, show_progress=show_progress)
    return parser.parse_symbols(Path(input_path), statuses=statuses)
[docs]
def generate_ncbi(
    input_path: Path | str | None = None,
    tax_id: str = "9606",
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Build the NCBI Gene secondary-to-primary ID mapping set.

    The gene history file is downloaded automatically when ``input_path``
    is not given.

    Args:
        input_path: Local gene_history file. Downloaded when ``None``.
        tax_id: NCBI taxonomy ID to filter on (default ``"9606"``, human).
        version: Version string for metadata (also passed to the download).
        show_progress: Whether to show progress bars.

    Returns:
        The parsed mapping set.
    """
    from pysec2pri.parsers import NCBIParser

    if input_path is None:
        downloaded = _auto_download("ncbi", version)
        input_path = downloaded["gene_history"]

    history_file = Path(input_path)
    return NCBIParser(version=version, show_progress=show_progress).parse(
        history_file, tax_id=tax_id
    )
[docs]
def generate_ncbi_symbols(
    input_path: Path | str | None = None,
    tax_id: str = "9606",
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Build the NCBI Gene symbol to previous-symbol mapping set.

    The gene_info file is downloaded automatically when ``input_path`` is
    not given.

    Args:
        input_path: Local gene_info file. Downloaded when ``None``.
        tax_id: NCBI taxonomy ID to filter on (default ``"9606"``, human).
        version: Version string for metadata (also passed to the download).
        show_progress: Whether to show progress bars.

    Returns:
        The parsed mapping set.
    """
    from pysec2pri.parsers import NCBIParser

    if input_path is None:
        downloaded = _auto_download("ncbi", version)
        input_path = downloaded["gene_info"]

    info_file = Path(input_path)
    return NCBIParser(version=version, show_progress=show_progress).parse_symbols(
        info_file, tax_id=tax_id
    )
[docs]
def generate_uniprot(
    input_path: Path | str | None = None,
    delac_file: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Build the UniProt secondary-to-primary accession mapping set.

    When ``input_path`` is omitted the UniProt files are downloaded; the
    ``"sec_ac"`` entry (or, failing that, the first downloaded file) becomes
    the input, and ``delac_file`` defaults to the ``"delac_sp"`` entry if one
    was downloaded. When ``input_path`` is supplied nothing is downloaded,
    so ``delac_file`` stays exactly as passed.

    Args:
        input_path: Local sec_ac.txt. Downloaded when ``None``.
        delac_file: Local delac_sp.txt; optional.
        version: Version string for metadata (also passed to the download).
        show_progress: Whether to show progress bars.

    Returns:
        The parsed mapping set.
    """
    from pysec2pri.parsers import UniProtParser

    if input_path is None:
        downloaded = _auto_download("uniprot", version)
        input_path = downloaded.get("sec_ac") or next(iter(downloaded.values()))
        if delac_file is None:
            delac_file = downloaded.get("delac_sp")

    uniprot = UniProtParser(version=version, show_progress=show_progress)
    sec_ac_file = Path(input_path)
    # A falsy delac_file (None or empty string) means "no deleted-accession file".
    delac_path = Path(delac_file) if delac_file else None
    return uniprot.parse(sec_ac_file, delac_path=delac_path)
[docs]
def generate_hmdb(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Build the HMDB metabolite secondary-to-primary accession mapping set.

    The metabolites file is downloaded automatically when ``input_path`` is
    not given.

    Args:
        input_path: Local hmdb_metabolites.xml (or .zip/.gz). Downloaded
            when ``None``.
        version: Version string for metadata (also passed to the download).
        show_progress: Whether to show progress bars.

    Returns:
        The parsed mapping set.
    """
    from pysec2pri.parsers import HMDBParser

    if input_path is None:
        downloaded = _auto_download("hmdb", version)
        input_path = downloaded["metabolites"]

    metabolites_file = Path(input_path)
    return HMDBParser(version=version, show_progress=show_progress).parse(metabolites_file)
[docs]
def generate_hmdb_proteins(
    input_path: Path | str | None = None,
    version: str | None = None,
    show_progress: bool = True,
) -> Sec2PriMappingSet:
    """Build the HMDB protein secondary-to-primary accession mapping set.

    The proteins file is downloaded automatically when ``input_path`` is
    not given.

    Args:
        input_path: Local hmdb_proteins.xml (or .zip/.gz). Downloaded when
            ``None``.
        version: Version string for metadata (also passed to the download).
        show_progress: Whether to show progress bars.

    Returns:
        The parsed mapping set.
    """
    from pysec2pri.parsers import HMDBParser

    if input_path is None:
        downloaded = _auto_download("hmdb", version)
        input_path = downloaded["proteins"]

    proteins_file = Path(input_path)
    return HMDBParser(version=version, show_progress=show_progress).parse_proteins(
        proteins_file
    )
[docs]
def generate_wikidata(
    input_path: Path | str | None = None,
    entity_type: str | None = None,
    version: str | None = None,
    endpoint: str | None = None,
    show_progress: bool = True,
    test_subset: bool = False,
) -> Sec2PriMappingSet:
    """Build Wikidata redirect mappings from SPARQL or a pre-downloaded TSV.

    With ``input_path`` the file is parsed and no query is issued. Without
    it, a single entity type is queried when ``entity_type`` is given, and
    all entity types are queried and combined (``parse_all``) when it is
    ``None``.

    Args:
        input_path: Pre-downloaded TSV file. Queries SPARQL if ``None``.
        entity_type: ``"metabolites"``, ``"chemicals"``, ``"genes"``, or
            ``"proteins"``. Queries all types when ``None``; the parser
            itself is then constructed with ``"metabolites"``.
        version: Version string for metadata.
        endpoint: Custom SPARQL endpoint URL.
        show_progress: Whether to show progress bars.
        test_subset: Use test queries limited to 10 results.

    Returns:
        The parsed mapping set.
    """
    from pysec2pri.parsers import WikidataParser

    parser_entity_type = entity_type or "metabolites"
    wikidata = WikidataParser(
        version=version,
        show_progress=show_progress,
        entity_type=parser_entity_type,
        endpoint=endpoint,
        test_subset=test_subset,
    )

    # A local file takes precedence over any SPARQL query.
    if input_path is not None:
        return wikidata.parse_from_file(Path(input_path))
    return wikidata.parse_all() if entity_type is None else wikidata.parse()
[docs]
def generate_wikidata_symbols(
    input_path: Path | str | None = None,
    entity_type: str | None = None,
    version: str | None = None,
    endpoint: str | None = None,
    show_progress: bool = True,
    test_subset: bool = False,
) -> LabelMappingSet:
    """Build Wikidata label mappings (previous label to current label).

    One parser is run per entity type. With a single entity type its result
    is returned as is. With several, all mappings are gathered into the
    first result, whose cardinalities are then recomputed; that first result
    object is the one returned.

    Args:
        input_path: Pre-downloaded TSV file, handed to every parser. A falsy
            value makes each parser receive ``None`` instead.
        entity_type: ``"metabolites"``, ``"chemicals"``, ``"genes"``, or
            ``"proteins"``. When ``None``, ``"metabolites"``, ``"genes"``
            and ``"proteins"`` are processed in that order.
        version: Version string for metadata.
        endpoint: Custom SPARQL endpoint URL.
        show_progress: Whether to show progress bars.
        test_subset: Use test queries limited to 10 results.

    Returns:
        :class:`~pysec2pri.parsers.base.LabelMappingSet` with label mappings.
    """
    from pysec2pri.parsers import WikidataParser

    if entity_type is None:
        requested_types = ["metabolites", "genes", "proteins"]
    else:
        requested_types = [entity_type]
    source = Path(input_path) if input_path else None

    label_sets = [
        WikidataParser(
            version=version,
            show_progress=show_progress,
            entity_type=current_type,
            endpoint=endpoint,
            test_subset=test_subset,
        ).parse_symbols(source)
        for current_type in requested_types
    ]

    first, *others = label_sets
    if not others:
        return first

    # Fold every set's mappings into a new list and attach it to the first set.
    merged = list(first.mappings or [])
    for extra in others:
        merged.extend(extra.mappings or [])
    first.mappings = merged
    first.compute_cardinalities()
    return first
[docs]
def combine_mapping_sets(
    id_mappings: Sec2PriMappingSet | None,
    synonym_mappings: Sec2PriMappingSet | None,
) -> Sec2PriMappingSet:
    """Merge two mapping sets into one.

    When both sets are given, ``id_mappings`` is updated in place: its
    ``mappings`` attribute is replaced by a new list holding its own
    mappings followed by those of ``synonym_mappings``, and ``id_mappings``
    itself is returned. When only one set is given, that set is returned
    untouched.

    Args:
        id_mappings: First mapping set (e.g. ID mappings).
        synonym_mappings: Second mapping set (e.g. synonym mappings).

    Returns:
        Combined mapping set.

    Raises:
        ValueError: If both mapping sets are ``None``.
    """
    if id_mappings is None:
        if synonym_mappings is None:
            msg = "At least one mapping set must be provided"
            raise ValueError(msg)
        return synonym_mappings
    if synonym_mappings is None:
        return id_mappings

    # A missing (None) mappings attribute counts as an empty list.
    id_mappings.mappings = [
        *(id_mappings.mappings or []),
        *(synonym_mappings.mappings or []),
    ]
    return id_mappings
# Output helpers
_FORMAT_EXTENSIONS: dict[str, str] = {
"rdf": ".ttl",
"owl": "_owl.ttl",
"json": ".json",
}
def _output_filename(base_name: str, fmt: str) -> Path:
"""Return the default filename for *base_name* in format *fmt*."""
ext = _FORMAT_EXTENSIONS.get(fmt, ".tsv")
if fmt == "owl":
return Path(f"{base_name}{ext}")
return Path(f"{base_name}_{fmt}{ext}")
[docs]
def save(
    mapping_set: Sec2PriMappingSet,
    output_format: str,
    output: Path | str | None = None,
    *,
    base_name: str,
) -> Path:
    """Write *mapping_set* in *output_format* and return what was written.

    ``"all"`` is handled by :func:`write_all_formats`; every other format is
    delegated to :meth:`~pysec2pri.parsers.base.Sec2PriMappingSet.save`.

    Args:
        mapping_set: The mapping set to write.
        output_format: One of ``sssom``, ``sec2pri``, ``pri_ids``,
            ``name2synonym``, ``symbol_sec2pri``, ``pri_symbols``,
            ``rdf``, ``json``, ``owl``, or ``all``.
        output: Explicit output path or directory. When falsy, a default
            name derived from *base_name* is used. For ``"all"``, a path
            with a suffix is treated as a file and the output directory
            becomes ``<its parent>/<base_name>``. For single formats, an
            existing directory receives the default file name.
        base_name: Stem used to derive file names, e.g. ``"hgnc_2026-04-07"``.

    Returns:
        The output directory for ``"all"``; otherwise the value returned by
        ``mapping_set.save``.
    """
    target = Path(output) if output else None

    if output_format != "all":
        # Single format: settle on a file path, then let the mapping set write it.
        if target is None:
            file_path = _output_filename(base_name, output_format)
        elif target.is_dir():
            file_path = target / _output_filename(base_name, output_format).name
        else:
            file_path = target
        return mapping_set.save(output_format, file_path)

    # "all": settle on a directory that will receive one file per format.
    if target is None:
        directory = Path(base_name)
    elif target.suffix:
        directory = target.parent / base_name
    else:
        directory = target
    write_all_formats(mapping_set, directory, base_name)
    return directory
[docs]
def write_diff_output(
    result: MappingDiff,
    output_path: Path,
) -> None:
    """Write diff results to a TSV file.

    The output has the columns ``change_type``, ``object_id``,
    ``new_subject_id`` and ``old_subject_id``. Added rows leave
    ``old_subject_id`` empty and removed rows leave ``new_subject_id``
    empty. When the diff has no added, removed or changed mappings, no file
    is written at all.

    Args:
        result: MappingDiff object with added/removed/changed mappings.
        output_path: Path to write the TSV file.
    """
    import polars as pl

    dfs = []

    if result.added_count > 0:
        added_df = result.added.with_columns(
            pl.lit("added").alias("change_type"),
            pl.lit(None).alias("old_subject_id"),
        ).select(
            [
                "change_type",
                "object_id",
                pl.col("subject_id").alias("new_subject_id"),
                "old_subject_id",
            ]
        )
        dfs.append(added_df)

    if result.removed_count > 0:
        removed_df = result.removed.with_columns(
            pl.lit("removed").alias("change_type"),
            pl.lit(None).alias("new_subject_id"),
        ).select(
            [
                "change_type",
                "object_id",
                "new_subject_id",
                pl.col("subject_id").alias("old_subject_id"),
            ]
        )
        dfs.append(removed_df)

    if result.changed_count > 0:
        changed_df = result.changed.with_columns(
            pl.lit("changed").alias("change_type"),
        ).select(
            [
                "change_type",
                "object_id",
                "new_subject_id",
                "old_subject_id",
            ]
        )
        dfs.append(changed_df)

    if dfs:
        # The ``pl.lit(None)`` placeholders have the Null dtype while the same
        # column is populated in the other frames; the relaxed variant coerces
        # mismatched columns to their common supertype instead of requiring
        # identical schemas.
        combined = pl.concat(dfs, how="vertical_relaxed")
        combined.write_csv(output_path, separator="\t")
[docs]
def load_mapping(path: Path | str) -> IdMappingSet:
    """Load an ID mapping set from a pysec2pri TSV file.

    Accepts the ``sec2pri`` TSV format (columns ``subject_id``, ``object_id``,
    ``predicate_id``, ``mapping_cardinality``) and the full SSSOM TSV format
    (comment-prefixed metadata lines are skipped automatically).

    Missing columns and empty cells fall back to defaults: ``""`` for
    ``subject_id``, ``object_id`` and ``predicate_id``,
    ``"semapv:BackgroundKnowledgeBasedMatching"`` for
    ``mapping_justification`` and ``None`` for ``mapping_cardinality``.

    Args:
        path: Path to the TSV file to load.

    Returns:
        An :class:`~pysec2pri.parsers.base.IdMappingSet` populated from the
        file, ready to pass to :func:`resolve_ids`.
    """
    import pandas as pd
    from sssom_schema import Mapping

    path = Path(path)
    # keep_default_na=False keeps every cell a string: without it pandas turns
    # empty cells and tokens such as "NA" or "null" into float NaN, which is
    # truthy and would therefore bypass the ``or`` fallbacks below.
    # NOTE(review): comment="#" also cuts a data line at any later "#".
    df = pd.read_csv(path, sep="\t", dtype=str, comment="#", keep_default_na=False)

    def _text(record: dict, column: str) -> str | None:
        """Return the non-empty string stored under *column*, else ``None``."""
        value = record.get(column)
        return value if isinstance(value, str) and value else None

    ms = IdMappingSet(
        mapping_set_id=str(path),
        license="https://creativecommons.org/licenses/by/4.0/",
    )
    # Plain dict records avoid building one pandas Series per row.
    ms.mappings = [
        Mapping(
            subject_id=_text(record, "subject_id") or "",
            object_id=_text(record, "object_id") or "",
            predicate_id=_text(record, "predicate_id") or "",
            mapping_justification=_text(record, "mapping_justification")
            or "semapv:BackgroundKnowledgeBasedMatching",
            mapping_cardinality=_text(record, "mapping_cardinality"),
        )
        for record in df.to_dict(orient="records")
    ]
    return ms
[docs]
def load_label_mapping(path: Path | str) -> LabelMappingSet:
    """Load a label/symbol mapping set from a pysec2pri TSV file.

    Accepts the ``symbol2prev`` TSV format (columns ``subject_id``,
    ``subject_label``, ``object_label``, ``mapping_cardinality``) and the
    full SSSOM TSV format (comment-prefixed metadata lines are skipped).

    Missing columns and empty cells fall back to defaults: ``""`` for
    ``subject_id`` and ``predicate_id``,
    ``"semapv:BackgroundKnowledgeBasedMatching"`` for
    ``mapping_justification`` and ``None`` for ``subject_label``,
    ``object_label`` and ``mapping_cardinality``.

    Args:
        path: Path to the TSV file to load.

    Returns:
        A :class:`~pysec2pri.parsers.base.LabelMappingSet` populated from
        the file, ready to pass to :func:`resolve_symbols`.
    """
    import pandas as pd
    from sssom_schema import Mapping

    path = Path(path)
    # keep_default_na=False keeps every cell a string: without it pandas turns
    # empty cells and labels such as "NA" or "null" into float NaN, which is
    # truthy and would therefore bypass the ``or`` fallbacks below.
    # NOTE(review): comment="#" also cuts a data line at any later "#".
    df = pd.read_csv(path, sep="\t", dtype=str, comment="#", keep_default_na=False)

    def _text(record: dict, column: str) -> str | None:
        """Return the non-empty string stored under *column*, else ``None``."""
        value = record.get(column)
        return value if isinstance(value, str) and value else None

    ms = LabelMappingSet(
        mapping_set_id=str(path),
        license="https://creativecommons.org/licenses/by/4.0/",
    )
    # Plain dict records avoid building one pandas Series per row.
    ms.mappings = [
        Mapping(
            subject_id=_text(record, "subject_id") or "",
            subject_label=_text(record, "subject_label"),
            object_label=_text(record, "object_label"),
            predicate_id=_text(record, "predicate_id") or "",
            mapping_justification=_text(record, "mapping_justification")
            or "semapv:BackgroundKnowledgeBasedMatching",
            mapping_cardinality=_text(record, "mapping_cardinality"),
        )
        for record in df.to_dict(orient="records")
    ]
    return ms
[docs]
def resolve_ids(
    input_path: Path | str | list[str],
    mapping_set: Sec2PriMappingSet,
    at: str | list[str] | None = None,
    *,
    output_path: Path | str | None = None,
    suffix: str = "_primary",
    sep: str | None = None,
) -> pd.DataFrame | str | list[str]:
    r"""Resolve secondary IDs to primary IDs.

    Direct lookup: if *input_path* is a list, or a value that does not name
    an existing file, it is treated as identifier(s) and looked up in
    *mapping_set*. Identifiers without an entry are returned unchanged.
    *at*, *output_path*, *suffix* and *sep* are ignored in this mode::

        resolve_ids("HMDB00001", hmdb_ms)                 # -> "HMDB:HMDB0000001"
        resolve_ids(["HMDB00001", "HMDB00002"], hmdb_ms)  # -> ["...", "..."]

    DataFrame mode: if *input_path* names an existing file, it is read with
    ``pandas.read_csv`` (all columns as strings) and passed to
    ``update_ids`` together with *at* and *suffix*; per column named in *at*
    a column ``<col><suffix>`` with the resolved IDs is expected in the
    result.

    Args:
        input_path: An identifier string, a list of identifier strings, or
            the path to a TSV/CSV file.
        mapping_set: A :class:`~pysec2pri.parsers.base.Sec2PriMappingSet`
            (e.g. the result of ``generate_hgnc()``).
        at: Column name(s) to resolve. Required in DataFrame mode; ignored
            in direct-lookup mode.
        output_path: If given, the resulting DataFrame is written to this
            path (DataFrame mode only), tab-separated for ``.tsv`` and
            comma-separated otherwise.
        suffix: Suffix appended to each resolved column name
            (default ``"_primary"``).
        sep: Delimiter for reading the file. Inferred from the extension
            when ``None`` (tab for ``.tsv``, comma otherwise).

    Returns:
        A resolved identifier string or a list of resolved strings
        (direct-lookup mode), or a :class:`pandas.DataFrame` (DataFrame mode).

    Raises:
        TypeError: If *input_path* is an existing file and *at* is ``None``.
    """
    import pandas as pd

    from pysec2pri.update_ids import build_lookup, update_ids

    # Direct lookup of several identifiers.
    if isinstance(input_path, list):
        table = build_lookup(mapping_set)
        return [table.get(item, item) for item in input_path]

    # Direct lookup of one identifier: anything that is not an existing path.
    source = Path(input_path)
    if not source.exists():
        identifier = str(input_path)
        return build_lookup(mapping_set).get(identifier, identifier)

    # DataFrame mode.
    if at is None:
        raise TypeError("resolve_ids() requires 'at' when input_path is a file")

    delimiter = sep
    if delimiter is None:
        delimiter = "\t" if source.suffix.lower() == ".tsv" else ","
    frame = pd.read_csv(source, sep=delimiter, dtype=str)

    table = build_lookup(mapping_set)
    resolved = pd.DataFrame(update_ids(frame, mapping_set, at=at, suffix=suffix, lookup=table))

    if output_path is not None:
        destination = Path(output_path)
        write_sep = "\t" if destination.suffix.lower() == ".tsv" else ","
        resolved.to_csv(destination, sep=write_sep, index=False)
    return resolved
[docs]
def resolve_symbols(
    input_path: Path | str | list[str],
    mapping_set: Sec2PriMappingSet,
    at: str | list[str] | None = None,
    *,
    output_path: Path | str | None = None,
    suffix: str = "_current",
    sep: str | None = None,
) -> pd.DataFrame | str | list[str]:
    r"""Resolve previous/alias symbols to current symbols.

    Direct lookup: if *input_path* is a list, or a value that does not name
    an existing file, it is treated as symbol(s) and looked up in
    *mapping_set*. Symbols without an entry are returned unchanged. *at*,
    *output_path*, *suffix* and *sep* are ignored in this mode::

        resolve_symbols("Ibuprofen", chebi_ms)               # -> "ibuprofen"
        resolve_symbols(["Ibuprofen", "Glucose"], chebi_ms)  # -> ["...", "..."]

    DataFrame mode: if *input_path* names an existing file, it is read with
    ``pandas.read_csv`` (all columns as strings) and passed to
    ``update_symbols`` together with *at* and *suffix*; per column named in
    *at* a column ``<col><suffix>`` with the resolved symbols is expected in
    the result.

    Args:
        input_path: A symbol string, a list of symbol strings, or the path
            to a TSV/CSV file.
        mapping_set: A :class:`~pysec2pri.parsers.base.LabelMappingSet`
            (e.g. the result of ``generate_hgnc_symbols()``).
        at: Column name(s) to resolve. Required in DataFrame mode; ignored
            in direct-lookup mode.
        output_path: If given, the resulting DataFrame is written to this
            path (DataFrame mode only), tab-separated for ``.tsv`` and
            comma-separated otherwise.
        suffix: Suffix appended to each resolved column name
            (default ``"_current"``).
        sep: Delimiter for reading the file. Inferred from the extension
            when ``None`` (tab for ``.tsv``, comma otherwise).

    Returns:
        A resolved symbol string or a list of resolved strings
        (direct-lookup mode), or a :class:`pandas.DataFrame` (DataFrame mode).

    Raises:
        TypeError: If *input_path* is an existing file and *at* is ``None``.
    """
    import pandas as pd

    from pysec2pri.update_ids import build_symbol_lookup, update_symbols

    # Direct lookup of several symbols.
    if isinstance(input_path, list):
        table = build_symbol_lookup(mapping_set)
        return [table.get(item, item) for item in input_path]

    # Direct lookup of one symbol: anything that is not an existing path.
    source = Path(input_path)
    if not source.exists():
        symbol = str(input_path)
        return build_symbol_lookup(mapping_set).get(symbol, symbol)

    # DataFrame mode.
    if at is None:
        raise TypeError("resolve_symbols() requires 'at' when input_path is a file")

    delimiter = sep
    if delimiter is None:
        delimiter = "\t" if source.suffix.lower() == ".tsv" else ","
    frame = pd.read_csv(source, sep=delimiter, dtype=str)

    table = build_symbol_lookup(mapping_set)
    resolved = pd.DataFrame(
        update_symbols(frame, mapping_set, at=at, suffix=suffix, lookup=table)
    )

    if output_path is not None:
        destination = Path(output_path)
        write_sep = "\t" if destination.suffix.lower() == ".tsv" else ","
        resolved.to_csv(destination, sep=write_sep, index=False)
    return resolved