# Source code for pysec2pri.update_ids

"""Resolve secondary identifiers to primary identifiers using a MappingSet.

Typical usage
-------------
Single string (possibly separated by commas/semicolons/pipes/whitespace)::

    from pysec2pri import generate_hgnc
    from pysec2pri.update_ids import update_ids

    ms = generate_hgnc()
    update_ids("HGNC:1234|HGNC:5678", ms)
    # {'HGNC:1234': 'HGNC:9999', 'HGNC:5678': 'HGNC:5678'}

List of strings::

    update_ids(["HGNC:1234", "HGNC:5678"], ms)

Pandas DataFrame, annotate one or more columns::

    import pandas as pd

    df = pd.DataFrame({"gene_id": ["HGNC:1234", "HGNC:5678"]})
    update_ids(df, ms, at="gene_id")
    # returns df with an extra column "gene_id_primary"

    # Multiple columns at once:
    update_ids(df, ms, at=["gene_id", "alt_id"])
    # returns df with "gene_id_primary" and "alt_id_primary" columns added

Notes
-----
* Identifiers that are not found in the mapping set are returned/kept
  as-is.
* Identifiers separated by common delimiters (``|``, ``,``, ``;``,
  whitespace) inside a single string are each looked up individually.
* The mapping look-up is done once against the full set of unique IDs
  to avoid repeated scans of large mapping sets.
* **Ambiguous identifiers**, those that appear both as a secondary ID
  in the mapping set and as a current primary ID, are left blank in
  the resolved output.  A warning is emitted listing every ambiguous
  token so the user can resolve them manually.
"""

from __future__ import annotations

import re
from typing import TYPE_CHECKING, Union, overload

if TYPE_CHECKING:
    import pandas as pd

from pysec2pri.logging import logger
from pysec2pri.parsers.base import Sec2PriMappingSet

# Predicate CURIE for "term replaced by".  Mappings that carry it record
# deprecation history and are skipped when the alias index is built.
_IAO_DEPRECATION = "IAO:0100001"

# Separator pattern: one or more pipes, commas, semicolons, or whitespace
# characters.  A run of mixed delimiters is treated as a single separator.
_SEP = re.compile(r"[|,;\s]+")

# Type alias for the flexible input accepted by the public resolvers.
# "pd.DataFrame" is kept as a string because pandas is only imported under
# TYPE_CHECKING at module level.
IdsInput = Union[str, list[str], "pd.DataFrame"]

# Sentinel for "this ID is ambiguous, do not replace".
# NOTE(review): not referenced in the visible part of this module; confirm
# whether anything outside this file relies on it before removing.
_AMBIGUOUS = object()


# Helpers


def _build_lookup(mapping_set: Sec2PriMappingSet) -> dict[str, str]:
    """Build the ``{subject_id: object_id}`` table for *mapping_set*.

    Mappings whose ``subject_id`` is missing or empty are skipped.  The
    ``object_id`` is stored verbatim (as a string), so a secondary ID that
    maps to the withdrawn sentinel keeps that sentinel and callers can
    decide what to do with it; a missing ``object_id`` is stored as ``""``.
    When a ``subject_id`` occurs more than once the last mapping wins.
    """
    pairs = (
        (
            str(getattr(entry, "subject_id", None) or ""),
            str(getattr(entry, "object_id", None) or ""),
        )
        for entry in (mapping_set.mappings or [])
    )
    return {secondary: primary for secondary, primary in pairs if secondary}


# ---------------------------------------------------------------------------
# Synonym-hint helpers (public)
# ---------------------------------------------------------------------------



[docs]
def build_alias_index(mapping_set: Sec2PriMappingSet) -> dict[str, list[str]]:
    """Group non-deprecation alias labels by the ``object_id`` they point to.

    Every mapping whose ``predicate_id`` differs from ``IAO:0100001``
    ("term replaced by") contributes its ``subject_label`` to the list kept
    for its ``object_id``.  Deprecation mappings are left out on purpose:
    they describe history, not active aliasing.  Mappings lacking either an
    ``object_id`` or a ``subject_label`` are ignored.  Labels are appended in
    mapping order and are not de-duplicated.

    :func:`resolve_ambiguous_with_hints` consumes this index to decide
    whether a user-supplied alias belongs to the secondary mapping's target
    or to the ambiguous entity's own primary entry.

    Args:
        mapping_set: A :class:`~pysec2pri.parsers.base.LabelMappingSet`
            (e.g. the result of ``generate_hgnc_labels()``).

    Returns:
        Dict mapping each ``object_id`` to the ``subject_label`` values
        linked to it through a non-IAO predicate.
    """
    aliases_by_target: dict[str, list[str]] = {}
    for entry in mapping_set.mappings or []:
        if str(getattr(entry, "predicate_id", None) or "") == _IAO_DEPRECATION:
            # Deprecation history, not an alias.
            continue
        target = str(getattr(entry, "object_id", None) or "")
        alias = str(getattr(entry, "subject_label", None) or "")
        if not (target and alias):
            continue
        if target not in aliases_by_target:
            aliases_by_target[target] = []
        aliases_by_target[target].append(alias)
    return aliases_by_target




[docs]
def build_primary_token_to_id(mapping_set: Sec2PriMappingSet) -> dict[str, str]:
    """Return ``{primary_label: primary_id}`` from a label mapping set.

    Collects every ``(object_label, object_id)`` pair seen in the mappings
    (the last mapping wins when a label repeats) and then, where available,
    fills in labels that are only present in the ``_primary_labels`` store.
    Useful for translating a primary label string into its CURIE so that
    :func:`build_alias_index` (keyed by object_id) can be looked up.

    When ``_primary_labels`` associates one label with several IDs in a
    ``set``/``frozenset``, the smallest ID (by string comparison) is used.
    Sets have no stable iteration order across interpreter runs, so taking
    "the first element" would make the result vary from run to run.  Ordered
    containers keep their first element.

    Args:
        mapping_set: A :class:`~pysec2pri.parsers.base.LabelMappingSet`.

    Returns:
        Dict ``{primary_label: primary_id}``.
    """
    result: dict[str, str] = {}
    for m in mapping_set.mappings or []:
        lbl = str(getattr(m, "object_label", None) or "")
        oid = str(getattr(m, "object_id", None) or "")
        if lbl and oid:
            result[lbl] = oid

    stored = getattr(mapping_set, "_primary_labels", None)
    if isinstance(stored, dict):
        for sym, ids in stored.items():
            # Labels already derived from the mappings take precedence.
            if not (sym and ids) or sym in result:
                continue
            if isinstance(ids, (set, frozenset)):
                # Deterministic pick: set order depends on string hashing.
                result[sym] = min(ids, key=str)
            else:
                result[sym] = next(iter(ids))
    return result




[docs]
def resolve_ambiguous_with_hints(
    ambiguous_token: str,
    user_aliases: list[str],
    lkp: dict[str, str],
    alias_index: dict[str, list[str]],
    token_to_id: dict[str, str] | None = None,
) -> tuple[str, str | None]:
    """Use user-provided alias hints to settle an ambiguous label or ID.

    A token is ambiguous when it is a current primary entry **and** also the
    secondary (subject) of a mapping that points to a different primary.
    The hints are tested against two interpretations, in this order:

    1. **Secondary usage**: a hint equals the mapping target
       (``lkp[ambiguous_token]``), equals the target's ID, or is listed among
       the target's non-IAO aliases in *alias_index*.  The token is then
       taken to be an old name of the target and
       ``(target_token, target_id)`` is returned.

    2. **Primary usage**: a hint is listed among the non-IAO aliases of the
       token's *own* entry.  The token is then taken to be the current
       primary and ``(ambiguous_token, own_id)`` is returned.

    When neither interpretation is confirmed, or no hints were supplied,
    the result is ``("", None)``.

    Args:
        ambiguous_token:
            The label or ID that is both primary and secondary.
        user_aliases:
            Alias strings provided by the user (e.g. from a ``known_alias``
            column) to help determine which entity is actually meant.
        lkp:
            ``{secondary_token: resolved_token}`` lookup (from
            :func:`build_label_lookup` or :func:`build_lookup`).
        alias_index:
            ``{primary_id: [non-IAO alias strings]}`` built via
            :func:`build_alias_index`.
        token_to_id:
            ``{primary_token: primary_id}``.  When ``None`` (or when a token
            is absent from it) the token is used as its own ID, which suits
            ID mapping sets where the token already is a CURIE
            (e.g. ``"HGNC:53564"``).

    Returns:
        A ``(resolved_token, resolved_id)`` tuple:

        * Secondary case -> ``(target_token, target_id)``; the ID is ``None``
          when the target has no usable ID.
        * Primary case   -> ``(ambiguous_token, own_id)``
        * Unresolvable   -> ``("", None)``
    """
    unresolved: tuple[str, str | None] = ("", None)
    if not user_aliases:
        return unresolved

    id_of = token_to_id or {}
    hints = set(user_aliases)

    # Interpretation 1: the token is an old name of the mapping target.
    # A hint naming the target's current label directly counts too, which
    # covers users writing the current label (e.g. "RNF141") as the hint.
    target_token = lkp.get(ambiguous_token, "")
    if target_token:
        target_id = id_of.get(target_token, target_token)
        confirmed = target_token in hints
        if not confirmed and target_id:
            confirmed = target_id in hints or not hints.isdisjoint(
                alias_index.get(target_id, [])
            )
        if confirmed:
            return (target_token, target_id or None)

    # Interpretation 2: the token is the current primary in its own right.
    own_id = id_of.get(ambiguous_token, ambiguous_token)
    if own_id and not hints.isdisjoint(alias_index.get(own_id, [])):
        return (ambiguous_token, own_id)

    return unresolved




[docs]
def build_ambiguous_set(mapping_set: Sec2PriMappingSet) -> set[str]:
    """Collect the subject IDs of *mapping_set* that are also primary IDs.

    An ID is *ambiguous* when it occurs as a ``subject_id`` (a
    secondary/previous term) while also being a current primary ID.  The
    primary IDs come from the ``_primary_ids`` attribute when the mapping
    set has one and it is non-empty; otherwise they are derived from the
    ``object_id`` values of the mappings.

    Replacing such an ID blindly could corrupt references that already use
    the current entity, which is why the resolvers blank these out instead.

    Args:
        mapping_set: Any :class:`~pysec2pri.parsers.base.Sec2PriMappingSet`.

    Returns:
        Set of ID strings that are both secondary and primary; empty when
        there is no overlap.
    """
    if hasattr(mapping_set, "_primary_ids"):
        current_ids = object.__getattribute__(mapping_set, "_primary_ids")
    else:
        current_ids = set()

    entries = mapping_set.mappings or []
    if not current_ids:
        # No stored primaries: fall back to the mapping targets.
        current_ids = {str(getattr(e, "object_id", None) or "") for e in entries}
        current_ids.discard("")

    previous_ids = {str(getattr(e, "subject_id", None) or "") for e in entries}
    previous_ids.discard("")
    return previous_ids & current_ids




[docs]
def build_ambiguous_labels_set(mapping_set: Sec2PriMappingSet) -> set[str]:
    """Collect the subject labels of *mapping_set* that are also primary labels.

    Label counterpart of :func:`build_ambiguous_set`: it compares
    ``subject_label`` values with the current primary labels.  Primary
    labels come from the ``_primary_labels`` attribute when present and
    non-empty (its keys when it is a dict, the value itself otherwise);
    failing that, they are derived from the ``object_label`` values of the
    mappings.

    Args:
        mapping_set: Any :class:`~pysec2pri.parsers.base.LabelMappingSet`.

    Returns:
        Set of label strings that are both secondary and primary; empty
        when there is no overlap.
    """
    if hasattr(mapping_set, "_primary_labels"):
        stored = object.__getattribute__(mapping_set, "_primary_labels")
    else:
        stored = set()

    # The store is a dict[label, set[ids]]; only the labels matter here.
    current_labels = set(stored.keys()) if isinstance(stored, dict) else stored

    entries = mapping_set.mappings or []
    if not current_labels:
        current_labels = {str(getattr(e, "object_label", None) or "") for e in entries}
        current_labels.discard("")

    previous_labels = {str(getattr(e, "subject_label", None) or "") for e in entries}
    previous_labels.discard("")
    return previous_labels & current_labels



def _warn_ambiguous(ambiguous_found: set[str], kind: str = "ID") -> None:
    """Log one warning that lists every ambiguous token left blank.

    Meant to be called after a resolver pass.  Nothing is logged when
    *ambiguous_found* is empty.

    Args:
        ambiguous_found: Set of token strings that were ambiguous.
        kind: Human-readable label for the token type (``"ID"`` or
            ``"label"``), used in the warning message.
    """
    if ambiguous_found:
        kind_lower = kind.lower()
        logger.warning(
            "%d ambiguous %s(s) were left blank in the resolved output because "
            "the same %s appears both as a current primary %s in this datasource "
            "AND as a secondary/previous %s that maps to a different entry. "
            "Automatic replacement would be unsafe, please resolve these manually. "
            "Ambiguous %s(s): %s",
            len(ambiguous_found),
            kind,
            kind_lower,
            kind_lower,
            kind_lower,
            kind,
            # Sorted so the message is stable from run to run.
            ", ".join(sorted(ambiguous_found)),
        )


def _split_ids(raw: str) -> list[str]:
    """Tokenise *raw* on the common delimiters, discarding empty tokens."""
    pieces = _SEP.split(raw.strip())
    return list(filter(None, pieces))


def _resolve_tokens(
    tokens: list[str],
    lookup: dict[str, str],
    ambiguous: set[str],
    ambiguous_found: set[str],
) -> list[str]:
    """Resolve *tokens* one by one, preserving order and length.

    Ambiguous tokens become ``""`` and are recorded in *ambiguous_found*
    (mutated in place); tokens absent from *lookup* are returned unchanged;
    everything else is replaced by its primary ID.
    """
    ambiguous_found.update(token for token in tokens if token in ambiguous)
    return [
        "" if token in ambiguous else lookup.get(token, token)
        for token in tokens
    ]


def _resolve_string(
    raw: str,
    lookup: dict[str, str],
    ambiguous: set[str],
    ambiguous_found: set[str],
) -> str:
    """Resolve all IDs inside *raw*, rejoining with the original separator.

    The separator used for rejoining is the first delimiter character
    (``|``, ``,``, ``;`` or whitespace) found **between** tokens.  It is
    looked up in the stripped text so that leading padding such as
    ``" A|B"`` is not mistaken for the separator (which would turn the cell
    into ``"A B"``).  A cell holding a single token is rejoined without any
    separator.

    Args:
        raw: Cell text holding one or more delimited identifiers.
        lookup: ``{secondary: primary}`` table.
        ambiguous: Tokens that must be blanked instead of replaced.
        ambiguous_found: Collector, mutated in place, of the ambiguous
            tokens actually encountered.

    Returns:
        The resolved tokens joined by the detected separator.
    """
    sep_match = re.search(r"[|,;\s]", raw.strip())
    sep = sep_match.group(0) if sep_match else ""
    tokens = _split_ids(raw)
    resolved = _resolve_tokens(tokens, lookup, ambiguous, ambiguous_found)
    return sep.join(resolved)


# Public API



[docs]
def build_lookup(mapping_set: Sec2PriMappingSet) -> dict[str, str]:
    """Build the ``{secondary_id: primary_id}`` table for *mapping_set*.

    Handy for applying the look-up by hand, or for building it once and
    passing it to several :func:`update_ids` calls via ``lookup=``.

    Args:
        mapping_set: A :class:`~pysec2pri.parsers.base.Sec2PriMappingSet`
            (e.g. the object returned by ``generate_hgnc()``).

    Returns:
        Dictionary mapping every secondary ID to its current primary ID.
    """
    secondary_to_primary = _build_lookup(mapping_set)
    return secondary_to_primary



# --- dispatch helpers (keep public functions below C901 threshold) ---


def _update_str(
    ids: str,
    lkp: dict[str, str],
    amb: set[str],
    kind: str,
) -> dict[str, str]:
    """Resolve the delimited tokens of one string.

    Returns ``{token: resolved}`` for each unique token in first-seen
    order.  Ambiguous tokens map to ``""`` and are reported through a
    single warning; unknown tokens map to themselves.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    ordered_unique = list(dict.fromkeys(_split_ids(ids)))
    flagged: set[str] = set()
    resolved = _resolve_tokens(ordered_unique, lkp, amb, flagged)
    _warn_ambiguous(flagged, kind=kind)
    return dict(zip(ordered_unique, resolved))


def _update_list(
    ids: list[str],
    lkp: dict[str, str],
    amb: set[str],
    kind: str,
) -> dict[str, str]:
    """Resolve the tokens of a list of (possibly delimited) strings.

    Every item is split on the common delimiters and the tokens of all
    items are pooled.  Returns ``{token: resolved}`` for each unique token
    in first-seen order.  Ambiguous tokens map to ``""`` and are reported
    through a single warning; unknown tokens map to themselves.
    """
    pooled = dict.fromkeys(tok for entry in ids for tok in _split_ids(entry))
    flagged = {tok for tok in pooled if tok in amb}
    resolved = {
        tok: "" if tok in flagged else lkp.get(tok, tok)
        for tok in pooled
    }
    _warn_ambiguous(flagged, kind=kind)
    return resolved


def _update_dataframe(
    df: pd.DataFrame,
    at: str | list[str] | None,
    suffix: str,
    lkp: dict[str, str],
    amb: set[str],
    kind: str,
    col_label: str,
) -> pd.DataFrame:
    """Add a resolved ``<col><suffix>`` column for each column named in *at*.

    Each cell is converted to ``str`` and resolved token by token.  The
    input frame is not modified; a copy is returned.  One warning listing
    the ambiguous tokens met in any column is emitted at the end.

    Args:
        df: Frame holding the identifier/label columns.
        at: Column name, or list of column names, to resolve.
        suffix: Text appended to each column name to form the new column.
        lkp: ``{secondary: primary}`` table.
        amb: Tokens that must be blanked instead of replaced.
        kind: Token type (``"ID"`` or ``"label"``) used in the warning.
        col_label: Name of the caller's input parameter, used in error
            messages.

    Raises:
        TypeError: If *df* is not a pandas DataFrame.
        ValueError: If *at* is ``None``.
        KeyError: If a requested column is absent from *df*.
    """
    import pandas as pd

    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            f"'{col_label}' must be a str, list[str], or pandas.DataFrame, "
            f"got {type(df).__name__!r}."
        )
    if at is None:
        raise ValueError(
            f"When '{col_label}' is a DataFrame you must specify 'at' "
            "(column name or list of names)."
        )

    targets: list[str] = [at] if isinstance(at, str) else list(at)
    absent = [name for name in targets if name not in df.columns]
    if absent:
        raise KeyError(f"Column(s) not found in DataFrame: {absent}")

    flagged: set[str] = set()

    def _resolve(cell: str) -> str:
        # Shares one collector across all cells and columns.
        return _resolve_string(cell, lkp, amb, flagged)

    annotated = df.copy()
    for name in targets:
        as_text = annotated[name].astype(str)
        annotated[name + suffix] = as_text.map(_resolve)
    _warn_ambiguous(flagged, kind=kind)
    return annotated


def _resolve_cell_with_hints(
    cell: str,
    lkp: dict[str, str],
    amb: set[str],
    user_aliases: list[str],
    alias_index: dict[str, list[str]],
    token_to_id: dict[str, str] | None,
    ambiguous_found: set[str],
) -> tuple[str, str | None]:
    """Resolve one cell value, using *user_aliases* to settle ambiguities.

    Each token of *cell* is handled in turn:

    * an ambiguous token is passed to :func:`resolve_ambiguous_with_hints`;
      if the hints settle it the chosen token is used, otherwise the token
      becomes ``""`` and is recorded in *ambiguous_found*;
    * any other token is replaced via *lkp* (unknown tokens are kept).

    The separator used for rejoining is the first delimiter character found
    in the stripped cell, so leading padding (``" A|B"``) is not mistaken
    for the separator.

    Args:
        cell: Cell text holding one or more delimited tokens.
        lkp: ``{secondary: primary}`` table.
        amb: Tokens that are ambiguous.
        user_aliases: Alias hints supplied for this row.
        alias_index: ``{primary_id: [alias, ...]}`` index.
        token_to_id: ``{primary_token: primary_id}`` or ``None`` when the
            tokens are IDs themselves.
        ambiguous_found: Collector, mutated in place, of the ambiguous
            tokens that could not be settled.

    Returns:
        A ``(resolved_value, resolved_id)`` tuple.  *resolved_id* is taken
        from the most recent hint-resolved token when there is one (a
        hint-resolved token always overwrites it); otherwise it is the ID
        of the first unambiguous token, i.e. its entry in *token_to_id* or
        the resolved token itself.  It stays ``None`` when the cell holds
        no tokens or only unsettled ambiguous ones.
    """
    sep_match = re.search(r"[|,;\s]", cell.strip())
    sep = sep_match.group(0) if sep_match else ""
    tokens = _split_ids(cell)
    resolved: list[str] = []
    resolved_id: str | None = None
    for tok in tokens:
        if tok in amb:
            hint_tok, hint_id = resolve_ambiguous_with_hints(
                tok, user_aliases, lkp, alias_index, token_to_id
            )
            if hint_tok:
                resolved.append(hint_tok)
                resolved_id = hint_id
            else:
                ambiguous_found.add(tok)
                resolved.append("")
        else:
            resolved_tok = lkp.get(tok, tok)
            resolved.append(resolved_tok)
            if resolved_id is None:
                # In ID mode (token_to_id=None) the resolved token IS the ID;
                # in label mode look up the corresponding primary ID.
                resolved_id = (token_to_id or {}).get(resolved_tok) or resolved_tok
    return sep.join(resolved), resolved_id


def _update_dataframe_with_synonyms(
    df: pd.DataFrame,
    at: str | list[str] | None,
    suffix: str,
    lkp: dict[str, str],
    amb: set[str],
    kind: str,
    col_label: str,
    synonyms_col: str,
    alias_index: dict[str, list[str]],
    token_to_id: dict[str, str] | None,
) -> pd.DataFrame:
    """Like :func:`_update_dataframe` but uses *synonyms_col* for per-row hints.

    For every column in *at* two columns are added to a copy of *df*:
    ``<col><suffix>`` with the resolved value and ``<col><suffix>_id`` with
    the resolved ID reported by :func:`_resolve_cell_with_hints`.

    Column values are iterated directly instead of through
    ``DataFrame.iloc[row]``.  Row-wise access builds a Series for every row
    and coerces the row to one common dtype, which is slow and can change
    how a value prints (an integer next to float columns becomes ``1.0``).

    Args:
        df: Frame holding the identifier/label columns.
        at: Column name, or list of column names, to resolve.
        suffix: Text appended to each column name to form the new columns.
        lkp: ``{secondary: primary}`` table.
        amb: Tokens that are ambiguous.
        kind: Token type (``"ID"`` or ``"label"``) used in the warning.
        col_label: Name of the caller's input parameter, used in error
            messages.
        synonyms_col: Column holding delimited alias hints.  When the
            column is absent, or a cell is blank/``nan``, the row has no
            hints and its ambiguous tokens stay blank.
        alias_index: ``{primary_id: [alias, ...]}`` index.
        token_to_id: ``{primary_token: primary_id}`` or ``None`` when the
            tokens are IDs themselves.

    Returns:
        A copy of *df* with the new columns.  One warning listing the
        ambiguous tokens that no hint could settle is emitted after all
        columns have been processed.

    Raises:
        TypeError: If *df* is not a pandas DataFrame.
        ValueError: If *at* is ``None``.
        KeyError: If a requested column is absent from *df*.
    """
    import pandas as pd

    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            f"'{col_label}' must be a str, list[str], or pandas.DataFrame, "
            f"got {type(df).__name__!r}."
        )
    if at is None:
        raise ValueError(
            f"When '{col_label}' is a DataFrame you must specify 'at' "
            "(column name or list of names)."
        )
    columns: list[str] = [at] if isinstance(at, str) else list(at)
    missing = [c for c in columns if c not in df.columns]
    if missing:
        raise KeyError(f"Column(s) not found in DataFrame: {missing}")

    # The hints depend only on the row, so parse them once for all columns.
    if synonyms_col in df.columns:
        raw_hints = [str(value) for value in df[synonyms_col]]
    else:
        raw_hints = [""] * len(df)
    hints_per_row: list[list[str]] = [
        _split_ids(raw) if raw.strip() not in {"", "nan"} else []
        for raw in raw_hints
    ]

    ambiguous_found: set[str] = set()
    result = df.copy()
    for col in columns:
        new_values: list[str] = []
        new_ids: list[str | None] = []
        for value, user_aliases in zip(result[col], hints_per_row):
            val, rid = _resolve_cell_with_hints(
                str(value),
                lkp,
                amb,
                user_aliases,
                alias_index,
                token_to_id,
                ambiguous_found,
            )
            new_values.append(val)
            new_ids.append(rid)
        result[col + suffix] = new_values
        result[col + suffix + "_id"] = new_ids
    # One aggregated warning, matching _update_dataframe.
    _warn_ambiguous(ambiguous_found, kind=kind)
    return result


@overload
def update_ids(
    ids: str,
    mapping_set: Sec2PriMappingSet,
    *,
    at: None = ...,
    suffix: str = ...,
    lookup: dict[str, str] | None = ...,
    ambiguous: set[str] | None = ...,
) -> dict[str, str]:
    """Typing overload: a (possibly delimited) ``str`` yields ``dict[str, str]``."""
    ...


@overload
def update_ids(
    ids: list[str],
    mapping_set: Sec2PriMappingSet,
    *,
    at: None = ...,
    suffix: str = ...,
    lookup: dict[str, str] | None = ...,
    ambiguous: set[str] | None = ...,
) -> dict[str, str]:
    """Typing overload: a ``list[str]`` yields ``dict[str, str]``."""
    ...


@overload
def update_ids(
    ids: pd.DataFrame,
    mapping_set: Sec2PriMappingSet,
    *,
    at: str | list[str],
    suffix: str = ...,
    lookup: dict[str, str] | None = ...,
    ambiguous: set[str] | None = ...,
    synonyms: str | list[str] | None = ...,
    label_mapping_set: Sec2PriMappingSet | None = ...,
) -> pd.DataFrame:
    """Typing overload: a ``DataFrame`` (with *at*) yields a ``DataFrame``.

    Only this overload accepts *synonyms* and *label_mapping_set*.
    """
    ...



[docs]
def update_ids(
    ids: IdsInput,
    mapping_set: Sec2PriMappingSet,
    *,
    at: str | list[str] | None = None,
    suffix: str = "_primary",
    lookup: dict[str, str] | None = None,
    ambiguous: set[str] | None = None,
    synonyms: str | list[str] | None = None,
    label_mapping_set: Sec2PriMappingSet | None = None,
) -> dict[str, str] | pd.DataFrame:
    """Resolve secondary identifiers to primary identifiers.

    Parameters
    ----------
    ids:
        The identifiers to resolve, given as one of:

        * **str**: one identifier, or several joined by ``|``, ``,``,
          ``;``, or whitespace.
        * **list[str]**: identifier strings, each of which may itself hold
          several delimited identifiers.
        * **pandas.DataFrame**: a frame whose identifier columns are named
          by *at* (required in this mode).

    mapping_set:
        The :class:`~pysec2pri.parsers.base.Sec2PriMappingSet` to resolve
        against (e.g. the result of ``generate_hgnc()``).

    at:
        *DataFrame mode only.* Column name or list of column names holding
        identifiers.  Each column ``col`` gets a companion column named
        ``col + suffix`` in the returned frame.

    suffix:
        Suffix for the companion columns in DataFrame mode (default
        ``"_primary"``).

    lookup:
        Ready-made ``{secondary_id: primary_id}`` dictionary, e.g. from
        :func:`build_lookup`.  Supplying it avoids rebuilding the table on
        every call.

    ambiguous:
        Ready-made set of ambiguous IDs (see :func:`build_ambiguous_set`).
        ``None`` means "compute it from *mapping_set*"; any explicit set,
        including an empty one, is used as given.

    synonyms:
        *DataFrame mode only.* Name of a column holding user-supplied alias
        strings (delimited by ``|``, ``,``, ``;``, or whitespace) that help
        settle ambiguous identifiers through
        :func:`resolve_ambiguous_with_hints`, using each row's own aliases.
        Only a single column name (``str``) triggers hint-based
        resolution; any other value is ignored.

    label_mapping_set:
        A :class:`~pysec2pri.parsers.base.LabelMappingSet` from which the
        alias index is built when *synonyms* is given.  If it is ``None``
        while *synonyms* is set, a warning is logged, hint-based resolution
        is skipped and ambiguous IDs stay blank.

    Returns
    -------
    dict[str, str]
        For ``str`` or ``list[str]`` input: each **unique** input
        identifier mapped to its resolved primary ID.  Identifiers missing
        from the mapping set map to themselves.  **Ambiguous identifiers
        map to an empty string** and a warning is emitted.

    pandas.DataFrame
        For ``DataFrame`` input: a copy of the frame with one new
        ``<col><suffix>`` column per entry in *at*, plus a
        ``<col><suffix>_id`` column when hint-based resolution is active.
        Ambiguous cells are set to ``""`` and reported in a warning.
    """
    lkp = _build_lookup(mapping_set) if lookup is None else lookup
    amb = build_ambiguous_set(mapping_set) if ambiguous is None else ambiguous

    if isinstance(ids, str):
        return _update_str(ids, lkp, amb, kind="ID")
    if isinstance(ids, list):
        return _update_list(ids, lkp, amb, kind="ID")

    # Anything else is treated as a DataFrame; the helpers validate it.
    hints_requested = isinstance(synonyms, str)
    if hints_requested and label_mapping_set is not None:
        return _update_dataframe_with_synonyms(
            ids,
            at,
            suffix,
            lkp,
            amb,
            kind="ID",
            col_label="ids",
            synonyms_col=synonyms,
            alias_index=build_alias_index(label_mapping_set),
            token_to_id=None,  # IDs are their own keys in alias_index
        )
    if hints_requested:
        logger.warning(
            "update_ids: 'synonyms' column %r specified but no 'label_mapping_set' "
            "was provided, hint-based ambiguity resolution will be skipped.",
            synonyms,
        )
    return _update_dataframe(ids, at, suffix, lkp, amb, kind="ID", col_label="ids")




[docs]
def build_label_lookup(mapping_set: Sec2PriMappingSet) -> dict[str, str]:
    """Build the ``{secondary_label: primary_label}`` table for *mapping_set*.

    Handy for applying the look-up by hand, or for building it once and
    passing it to several :func:`update_labels` calls via ``lookup=``.
    Mappings without a ``subject_label`` are skipped, a missing
    ``object_label`` is stored as ``""``, and the last mapping wins when a
    ``subject_label`` repeats.

    Args:
        mapping_set: A :class:`~pysec2pri.parsers.base.LabelMappingSet`
            (e.g. the result of ``generate_hgnc_labels()``).

    Returns:
        Dictionary mapping every previous/alias label to its current label.
    """
    return {
        previous: str(getattr(entry, "object_label", None) or "")
        for entry in (mapping_set.mappings or [])
        if (previous := str(getattr(entry, "subject_label", None) or ""))
    }



@overload
def update_labels(
    labels: str,
    mapping_set: Sec2PriMappingSet,
    *,
    at: None = ...,
    suffix: str = ...,
    lookup: dict[str, str] | None = ...,
    ambiguous: set[str] | None = ...,
    synonyms: str | list[str] | None = ...,
) -> dict[str, str]:
    """Typing overload: a (possibly delimited) ``str`` yields ``dict[str, str]``."""
    ...


@overload
def update_labels(
    labels: list[str],
    mapping_set: Sec2PriMappingSet,
    *,
    at: None = ...,
    suffix: str = ...,
    lookup: dict[str, str] | None = ...,
    ambiguous: set[str] | None = ...,
    synonyms: str | list[str] | None = ...,
) -> dict[str, str]:
    """Typing overload: a ``list[str]`` yields ``dict[str, str]``."""
    ...


@overload
def update_labels(
    labels: pd.DataFrame,
    mapping_set: Sec2PriMappingSet,
    *,
    at: str | list[str],
    suffix: str = ...,
    lookup: dict[str, str] | None = ...,
    ambiguous: set[str] | None = ...,
    synonyms: str | list[str] | None = ...,
) -> pd.DataFrame:
    """Typing overload: a ``DataFrame`` (with *at*) yields a ``DataFrame``."""
    ...



[docs]
def update_labels(
    labels: IdsInput,
    mapping_set: Sec2PriMappingSet,
    *,
    at: str | list[str] | None = None,
    suffix: str = "_current",
    lookup: dict[str, str] | None = None,
    ambiguous: set[str] | None = None,
    synonyms: str | list[str] | None = None,
) -> dict[str, str] | pd.DataFrame:
    """Resolve previous/alias gene labels to current labels.

    Label counterpart of :func:`update_ids`: resolution follows the
    ``subject_label`` to ``object_label`` mappings instead of the IDs.

    Parameters
    ----------
    labels:
        The labels to resolve, given as one of:

        * **str**: one label, or several joined by ``|``, ``,``, ``;``,
          or whitespace.
        * **list[str]**: label strings.
        * **pandas.DataFrame**: a frame whose label columns are named by
          *at* (required in this mode).

    mapping_set:
        A :class:`~pysec2pri.parsers.base.LabelMappingSet`
        (e.g. the result of ``generate_hgnc_labels()``).

    at:
        *DataFrame mode only.* Column name or list of column names holding
        labels.  Each column ``col`` gets a companion column named
        ``col + suffix`` in the returned frame.

    suffix:
        Suffix for the companion columns in DataFrame mode (default
        ``"_current"``).

    lookup:
        Ready-made ``{previous_label: current_label}`` dictionary, e.g.
        from :func:`build_label_lookup`.  Supplying it avoids rebuilding
        the table on every call.

    ambiguous:
        Ready-made set of ambiguous labels (see
        :func:`build_ambiguous_labels_set`).  ``None`` means "compute it
        from *mapping_set*".

    synonyms:
        *DataFrame mode only.* Name of a column holding user-supplied alias
        strings (delimited by ``|``, ``,``, ``;``, or whitespace) that help
        settle ambiguous labels through
        :func:`resolve_ambiguous_with_hints`, using each row's own aliases.
        The alias index is built from *mapping_set* itself (non-IAO
        entries).  Only a single column name (``str``) triggers hint-based
        resolution; any other value is ignored.

    Returns
    -------
    dict[str, str]
        For ``str`` or ``list[str]`` input: each unique input label mapped
        to its resolved current label.  Labels missing from the mapping set
        map to themselves.  **Ambiguous labels map to an empty string** and
        a warning is emitted.

    pandas.DataFrame
        For ``DataFrame`` input: a copy of the frame with one new
        ``<col><suffix>`` column per entry in *at*, plus a
        ``<col><suffix>_id`` column when hint-based resolution is active.
        Ambiguous cells are set to ``""`` and reported in a warning.
    """
    lkp = build_label_lookup(mapping_set) if lookup is None else lookup
    amb = build_ambiguous_labels_set(mapping_set) if ambiguous is None else ambiguous

    if isinstance(labels, str):
        return _update_str(labels, lkp, amb, kind="label")
    if isinstance(labels, list):
        return _update_list(labels, lkp, amb, kind="label")

    # Anything else is treated as a DataFrame; the helpers validate it.
    if not isinstance(synonyms, str):
        return _update_dataframe(
            labels, at, suffix, lkp, amb, kind="label", col_label="labels"
        )

    # Hint-based resolution: aliases and label-to-ID table both come from
    # the label mapping set itself.
    alias_index = build_alias_index(mapping_set)
    token_to_id = build_primary_token_to_id(mapping_set)
    return _update_dataframe_with_synonyms(
        labels,
        at,
        suffix,
        lkp,
        amb,
        kind="label",
        col_label="labels",
        synonyms_col=synonyms,
        alias_index=alias_index,
        token_to_id=token_to_id,
    )



# Public API of this module, in alphabetical order.  Underscore-prefixed
# helpers are intentionally left out.
__all__ = [
    "build_alias_index",
    "build_ambiguous_labels_set",
    "build_ambiguous_set",
    "build_label_lookup",
    "build_lookup",
    "build_primary_token_to_id",
    "resolve_ambiguous_with_hints",
    "update_ids",
    "update_labels",
]