Source code for pysec2pri.exports

"""Export functions for writing mapping sets to various file formats."""

from __future__ import annotations

from collections.abc import Callable
from pathlib import Path
from typing import TYPE_CHECKING

from sssom import MappingSetDataFrame

if TYPE_CHECKING:
    from pysec2pri.parsers.base import Sec2PriMappingSet

# Public API of this module: the ``WRITERS`` registry, the ``write_output``
# dispatcher and the individual ``write_*`` functions. The private helper
# ``_to_msdf_via_sssom_parser`` is deliberately not exported.
__all__ = [
    "WRITERS",
    "write_json",
    "write_name2synonym",
    "write_output",
    "write_owl",
    "write_pri_ids",
    "write_rdf",
    "write_sec2pri",
    "write_secondary",
    "write_sssom",
    "write_symbol2prev",
]


def _unescape_header_hex(content: str) -> str:
    r"""Decode ``\xNN`` escapes inside the leading ``#`` header block only.

    The SSSOM TSV metadata is emitted as a run of ``#``-prefixed lines at the
    top of the file. Only that leading run is rewritten; the first line that
    does not start with ``#`` ends the header, and everything from there on
    (column names and mapping rows) is returned untouched.

    Args:
        content: Full text of an SSSOM TSV file, using ``\n`` line endings.

    Returns:
        The text with each ``\xNN`` sequence in the header replaced by the
        character whose code point is ``0xNN``.
    """
    import re

    hex_escape = re.compile(r"\\x([0-9a-fA-F]{2})")
    # split/join on "\n" round-trips the text exactly (unlike splitlines()).
    lines = content.split("\n")
    for index, line in enumerate(lines):
        if not line.startswith("#"):
            break  # header is over; leave the data section alone
        lines[index] = hex_escape.sub(lambda m: chr(int(m.group(1), 16)), line)
    return "\n".join(lines)


def write_sssom(
    mapping_set: Sec2PriMappingSet,
    output_path: Path | str,
) -> Path:
    """Write a mapping set to an SSSOM TSV file.

    The mapping set is wrapped in a ``MappingSetDocument`` together with a
    ``curies.Converter`` built from ``mapping_set.curie_map``, converted to a
    mapping-set dataframe and written with ``sssom.writers.write_table``.
    Afterwards the file is re-read and ``\\xNN`` escapes in the metadata
    header are replaced by the characters they stand for.

    Args:
        mapping_set: The mapping set to write.
        output_path: Destination ``.sssom.tsv`` file path. Missing parent
            directories are created.

    Returns:
        Path to the written file.
    """
    from typing import cast

    import curies
    from sssom.parsers import to_mapping_set_dataframe  # type: ignore[attr-defined]
    from sssom.sssom_document import MappingSetDocument
    from sssom.writers import write_table

    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Build a curies.Converter from the curie_map stored on the mapping set.
    # Using Converter(records=...) preserves prefix casing exactly as declared
    # and handles pydantic-wrapped Prefix objects (which expose .prefix_url).
    raw_curie_map: object = mapping_set.curie_map or {}
    records: list[curies.Record] = []
    if isinstance(raw_curie_map, dict):
        for prefix, value in raw_curie_map.items():
            if isinstance(value, str):
                uri_prefix: str = value
            elif hasattr(value, "prefix_url"):
                uri_prefix = cast(str, value.prefix_url)
            else:
                # Best effort: entries of an unrecognised shape are skipped.
                continue
            records.append(curies.Record(prefix=prefix, uri_prefix=uri_prefix))
    converter = curies.Converter(records=records)

    doc = MappingSetDocument(mapping_set=mapping_set, converter=converter)
    msdf = to_mapping_set_dataframe(doc)

    with output_path.open("w", encoding="utf-8") as f:
        write_table(msdf, f)

    # Fix escaped unicode in the YAML header (sssom issue). The substitution
    # is limited to the header so that a literal backslash-x sequence inside
    # a mapping row is never altered.
    content = output_path.read_text(encoding="utf-8")
    output_path.write_text(_unescape_header_hex(content), encoding="utf-8")

    return output_path
def _to_msdf_via_sssom_parser(mapping_set: Sec2PriMappingSet) -> MappingSetDataFrame | None:
    """Round-trip a mapping set through a temporary SSSOM TSV file.

    The mapping set is serialised with ``write_sssom`` into a scratch file,
    which is then read back with ``sssom.parsers.parse_sssom_table``. The
    scratch file is removed afterwards, whether or not the round trip
    succeeded.

    Args:
        mapping_set: The mapping set to convert.

    Returns:
        Whatever ``parse_sssom_table`` returns for the scratch file, i.e. the
        ``MappingSetDataFrame`` used for RDF/JSON/OWL serialisation.
    """
    import tempfile

    from sssom.parsers import parse_sssom_table

    # delete=False: only the unique file name is needed here; the file itself
    # is (re)written by write_sssom and removed explicitly below.
    with tempfile.NamedTemporaryFile(
        suffix=".sssom.tsv",
        mode="w",
        encoding="utf-8",
        delete=False,
    ) as scratch_handle:
        scratch_path = Path(scratch_handle.name)

    try:
        write_sssom(mapping_set, scratch_path)
        parsed = parse_sssom_table(str(scratch_path))
        return parsed
    finally:
        scratch_path.unlink(missing_ok=True)
def write_rdf(
    mapping_set: Sec2PriMappingSet,
    output_path: Path | str,
    serialisation: str = "turtle",
) -> Path:
    """Serialise a mapping set as RDF.

    Args:
        mapping_set: The mapping set to write.
        output_path: Destination file path (e.g. ``mappings.ttl``). Missing
            parent directories are created.
        serialisation: Serialisation format name, forwarded unchanged to
            ``sssom.writers.write_rdf``.

    Returns:
        Path to the written file.

    Raises:
        ValueError: If the mapping set could not be converted to a
            mapping-set dataframe.
    """
    from sssom.writers import write_rdf as _sssom_write_rdf

    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    frame = _to_msdf_via_sssom_parser(mapping_set)
    if frame is None:
        raise ValueError("Failed to parse mapping set for RDF serialisation.")

    with destination.open("w", encoding="utf-8") as handle:
        _sssom_write_rdf(frame, handle, serialisation=serialisation)
    return destination
def write_json(
    mapping_set: Sec2PriMappingSet,
    output_path: Path | str,
) -> Path:
    """Serialise a mapping set as SSSOM JSON.

    Args:
        mapping_set: The mapping set to write.
        output_path: Destination file path (e.g. ``mappings.json``). Missing
            parent directories are created.

    Returns:
        Path to the written file.

    Raises:
        ValueError: If the mapping set could not be converted to a
            mapping-set dataframe.
    """
    from sssom.writers import write_json as _sssom_write_json

    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    frame = _to_msdf_via_sssom_parser(mapping_set)
    if frame is None:
        raise ValueError("Failed to parse mapping set for JSON serialisation.")

    with destination.open("w", encoding="utf-8") as handle:
        _sssom_write_json(frame, handle)
    return destination
def write_owl(
    mapping_set: Sec2PriMappingSet,
    output_path: Path | str,
    serialisation: str = "turtle",
) -> Path:
    """Serialise a mapping set as an OWL/RDF file (Turtle by default).

    Args:
        mapping_set: The mapping set to write.
        output_path: Destination file path (e.g. ``mappings_owl.ttl``).
            Missing parent directories are created.
        serialisation: Serialisation format name, forwarded unchanged to
            ``sssom.writers.write_owl``.

    Returns:
        Path to the written file.

    Raises:
        ValueError: If the mapping set could not be converted to a
            mapping-set dataframe.
    """
    from sssom.writers import write_owl as _sssom_write_owl

    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    frame = _to_msdf_via_sssom_parser(mapping_set)
    if frame is None:
        raise ValueError("Failed to parse mapping set for OWL serialisation.")

    with destination.open("w", encoding="utf-8") as handle:
        _sssom_write_owl(frame, handle, serialisation=serialisation)
    return destination
def write_pri_ids(
    mapping_set: Sec2PriMappingSet,
    output_path: Path | str,
) -> Path:
    """Dump the unique primary IDs of a mapping set, one per line, sorted.

    A non-empty ``_primary_ids`` attribute on the mapping set takes
    precedence. Otherwise the IDs are gathered from the truthy ``object_id``
    values of the individual mappings.

    Args:
        mapping_set: The mapping set to read primary IDs from.
        output_path: Destination file path. Missing parent directories are
            created.

    Returns:
        Path to the written file.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Prefer the authoritative collection when the mapping set carries one.
    primary_ids = getattr(mapping_set, "_primary_ids", None)
    if not primary_ids:
        # Nothing stored (or stored but empty): derive the IDs from the mappings.
        primary_ids = {
            str(object_id)
            for mapping in (mapping_set.mappings or [])  # type: ignore[has-type]
            if (object_id := getattr(mapping, "object_id", None))
        }

    with destination.open("w", encoding="utf-8") as handle:
        handle.writelines(f"{primary_id}\n" for primary_id in sorted(primary_ids))
    return destination
def write_sec2pri(
    mapping_set: Sec2PriMappingSet,
    output_path: Path | str,
) -> Path:
    """Write the secondary-to-primary ID table as a tab-separated file.

    The file starts with a header row and then holds one row per mapping
    with the columns ``subject_id``, ``object_id``, ``predicate_id`` and
    ``mapping_cardinality``. Missing or falsy attribute values are written
    as empty cells.

    Args:
        mapping_set: The mapping set to write.
        output_path: Destination file path (e.g. ``sec2pri.tsv``). Missing
            parent directories are created.

    Returns:
        Path to the written file.
    """
    # Each column name doubles as the mapping attribute it is read from.
    header = ("subject_id", "object_id", "predicate_id", "mapping_cardinality")

    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open("w", encoding="utf-8") as handle:
        handle.write("\t".join(header) + "\n")
        for mapping in mapping_set.mappings or []:  # type: ignore[has-type]
            cells = [str(getattr(mapping, field, "") or "") for field in header]
            handle.write("\t".join(cells) + "\n")
    return destination
def write_name2synonym(
    mapping_set: Sec2PriMappingSet,
    output_path: Path | str,
) -> Path:
    """Write the name-to-synonym table as a tab-separated file.

    The file starts with a header row; its columns are ``subject_id``,
    ``subject_label`` and ``object_label``. Mappings whose ``subject_label``
    and ``object_label`` are both unset are left out. Missing or falsy
    values are written as empty cells.

    Args:
        mapping_set: The mapping set to write.
        output_path: Destination file path (e.g. ``name2synonym.tsv``).
            Missing parent directories are created.

    Returns:
        Path to the written file.
    """
    header = ("subject_id", "subject_label", "object_label")

    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open("w", encoding="utf-8") as handle:
        handle.write("\t".join(header) + "\n")
        for mapping in mapping_set.mappings or []:  # type: ignore[has-type]
            name = getattr(mapping, "subject_label", None)
            synonym = getattr(mapping, "object_label", None)
            if not (name or synonym):
                continue  # no label on either side: nothing to report
            row = "\t".join(
                (
                    str(getattr(mapping, "subject_id", "") or ""),
                    str(name or ""),
                    str(synonym or ""),
                )
            )
            handle.write(f"{row}\n")
    return destination
def write_symbol2prev(
    mapping_set: Sec2PriMappingSet,
    output_path: Path | str,
) -> Path:
    """Write the symbol-to-previous-symbol table as a tab-separated file.

    The file starts with a header row; its columns are ``subject_id``,
    ``subject_label``, ``object_label`` and ``mapping_cardinality``. Mappings
    whose ``subject_label`` and ``object_label`` are both unset are left out.
    Missing or falsy values are written as empty cells.

    Args:
        mapping_set: The mapping set to write.
        output_path: Destination file path (e.g. ``symbol2prev.tsv``).
            Missing parent directories are created.

    Returns:
        Path to the written file.
    """
    header = ("subject_id", "subject_label", "object_label", "mapping_cardinality")

    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open("w", encoding="utf-8") as handle:
        handle.write("\t".join(header) + "\n")
        for mapping in mapping_set.mappings or []:  # type: ignore[has-type]
            symbol = getattr(mapping, "subject_label", None)
            previous = getattr(mapping, "object_label", None)
            if not (symbol or previous):
                continue  # no label on either side: nothing to report
            row = "\t".join(
                (
                    str(getattr(mapping, "subject_id", "") or ""),
                    str(symbol or ""),
                    str(previous or ""),
                    str(getattr(mapping, "mapping_cardinality", "") or ""),
                )
            )
            handle.write(f"{row}\n")
    return destination
def write_secondary(
    mapping_set: Sec2PriMappingSet,
    output_path: Path | str,
) -> Path:
    """Dump the unique secondary IDs of a mapping set, one per line, sorted.

    The secondary IDs are the truthy ``subject_id`` values of the mappings,
    converted to strings and de-duplicated.

    Args:
        mapping_set: The mapping set to read secondary IDs from.
        output_path: Destination file path. Missing parent directories are
            created.

    Returns:
        Path to the written file.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    secondary_ids = {
        str(subject_id)
        for mapping in (mapping_set.mappings or [])  # type: ignore[has-type]
        if (subject_id := getattr(mapping, "subject_id", None))
    }

    with destination.open("w", encoding="utf-8") as handle:
        handle.writelines(f"{secondary_id}\n" for secondary_id in sorted(secondary_ids))
    return destination
# Registry mapping config output names to writer functions.
# Several names are aliases that resolve to the same writer.
WRITERS: dict[str, Callable[..., Path]] = {
    # SSSOM TSV
    "sssom": write_sssom,
    # Secondary -> primary ID table (two spellings)
    "sec2pri": write_sec2pri,
    "secID2priID": write_sec2pri,
    # Plain ID lists
    "pri_ids": write_pri_ids,
    "priIDs": write_pri_ids,
    "secIDs": write_secondary,
    # Label tables
    "name2synonym": write_name2synonym,
    "symbol2prev": write_symbol2prev,
    # Serialisations produced through the sssom writers
    "rdf": write_rdf,
    "json": write_json,
    "owl": write_owl,
}
def write_output(
    mapping_set: Sec2PriMappingSet,
    output_format: str,
    output_path: Path | str,
) -> Path:
    """Dispatch to the writer registered for ``output_format``.

    Args:
        mapping_set: The mapping set to write.
        output_format: Format name; must be a key of ``WRITERS``.
        output_path: Path to write to.

    Returns:
        Path to the written file, as returned by the selected writer.

    Raises:
        ValueError: If ``output_format`` has no registered writer.
    """
    selected = WRITERS.get(output_format)
    if selected is not None:
        return selected(mapping_set, output_path)

    # Unknown name: list the valid choices so the caller can correct it.
    raise ValueError(
        f"Unknown output format: {output_format!r}. Available: {sorted(WRITERS)}"
    )