Source code for atlas_doc_parser.mark_or_node

# -*- coding: utf-8 -*-

"""
Base classes for ADF data model.

This module provides the foundational classes for deserializing Atlassian Document
Format (ADF) JSON into Python objects:

- ``Base``: Base class for all dataclasses
- ``BaseMark``: Base class for text marks (formatting)
- ``BaseNode``: Base class for document nodes
"""

import typing as T
import copy
import dataclasses

from func_args.api import BaseFrozenModel, REQ, OPT, remove_optional

from .type_hint import T_DATA
from .type_enum import TypeEnum, check_type_match
from .exc import UnimplementedTypeError
from .logger import logger

from . import settings


# Alias for a ``{field name: dataclasses.Field}`` mapping of one dataclass.
T_FIELDS = dict[str, dataclasses.Field]
# Filled lazily by ``Base.get_fields()``; keyed by the class object itself, so
# each subclass gets its own entry.
_CLASS_FIELD: dict[T.Any, T_FIELDS] = {}  # class fields cache



[docs]
@dataclasses.dataclass(frozen=True)
class Base(BaseFrozenModel):
    """
    Base class for all ADF dataclasses.

    Provides common functionality:

    - ``get_fields()``: Cached ``{name: dataclasses.Field}`` mapping of the class
    - ``from_dict()``: Deserialize from dictionary, ignoring unknown keys
    - ``to_dict()``: Serialize to dictionary
    - ``to_kwargs()``: Delegates to ``to_dict()``
    - ``is_opt()``: Identity test against the ``OPT`` sentinel
    """


[docs]
    @classmethod
    def get_fields(cls) -> T_FIELDS:
        """
        Return this class's dataclass fields as a ``{name: Field}`` mapping.

        The mapping is built with ``dataclasses.fields`` the first time it is
        requested for a given class and is served from the module-level
        ``_CLASS_FIELD`` cache on every later call.
        """
        cached = _CLASS_FIELD.get(cls)
        if cached is None:
            # First request for this class: build the mapping and remember it.
            cached = {f.name: f for f in dataclasses.fields(cls)}
            _CLASS_FIELD[cls] = cached
        return cached



[docs]
    def to_dict(self) -> T_DATA:
        """
        Serialize this instance into a plain dictionary.

        All fields are collected with ``dataclasses.asdict`` and the result is
        passed through ``remove_optional`` (from ``func_args``; presumably it
        drops entries still set to the ``OPT`` sentinel -- confirm there).
        """
        all_fields = dataclasses.asdict(self)
        return remove_optional(**all_fields)



[docs]
    def to_kwargs(self) -> T_DATA:
        """
        Convert the dataclass to a dictionary suitable for function calls.

        This delegates to ``self.to_dict()``, so a subclass override of
        ``to_dict()`` also determines what this method returns.
        """
        return self.to_dict()



[docs]
    @classmethod
    def from_dict(cls, dct: T_DATA) -> "Base":
        """
        Build an instance of ``cls`` from a dictionary.

        Only keys that match a field declared on the dataclass are forwarded to
        the constructor. Any other key in ``dct`` is ignored, and a field that
        is absent from ``dct`` is simply not passed, leaving it to the
        constructor's default.

        Ignoring unknown keys is deliberate defensive programming: the
        Atlassian Document Format may gain new fields over time, and an older
        version of this library must not fail just because the input carries
        fields it does not know about.

        :param dct: Source dictionary.
        :return: A new instance of ``cls``.
        """
        init_kwargs = {}
        for name in cls.get_fields():
            try:
                value = dct[name]
            except KeyError:
                # Not supplied in the input; do not pass it to the constructor.
                continue
            init_kwargs[name] = value
        return cls(**init_kwargs)


    def is_opt(self, value: T.Any) -> bool:
        """
        Return True if ``value`` is the ``OPT`` sentinel object.

        The comparison is by identity (``is``), not by equality.
        """
        return value is OPT



# Type variable bound to ``Base`` (matches ``Base`` and any subclass).
T_BASE = T.TypeVar("T_BASE", bound=Base)



[docs]
@dataclasses.dataclass(frozen=True)
class BaseMarkOrNode(Base):
    """
    Base class for ADF marks and nodes.

    Declares the ``type`` field shared by both and the ``is_type_of()`` helper
    for comparing that field against ``TypeEnum`` members.
    """

    # ``REQ`` is passed as the default factory; presumably func_args uses it to
    # flag a field the caller must supply -- confirm against func_args.
    type: str = dataclasses.field(default_factory=REQ)


[docs]
    def is_type_of(
        self,
        expected_types: TypeEnum | list[TypeEnum],
    ) -> bool:
        """
        Check if this element's type matches one or more expected types.

        The comparison itself is delegated to ``check_type_match``, which
        receives ``self.type`` and ``expected_types`` unchanged.

        :param expected_types: A single TypeEnum member or list of TypeEnum members
            to match against. If a list is provided, returns True if this element's
            type matches ANY of the expected types.

        :return: True if this element's type matches (any of) the expected type(s).

        Example::

            >>> BaseNode(...).is_type_of(TypeEnum.paragraph)
            True
            >>> BaseMark(...).is_type_of([TypeEnum.strong, TypeEnum.em])
            True
        """
        return check_type_match(self.type, expected_types)




# =============================================================================
# BaseMark Class
# =============================================================================

[docs]
@dataclasses.dataclass(frozen=True)
class BaseMark(BaseMarkOrNode):
    """
    Base class for ADF marks (text formatting).

    Marks represent formatting applied to text nodes, such as:

    - ``strong`` (bold)
    - ``em`` (italic)
    - ``link`` (hyperlink)
    - ``code`` (inline code)

    Subclasses should override ``to_markdown()`` to provide format conversion;
    the implementation on this class returns the text it is given unchanged.
    """


[docs]
    @classmethod
    def from_dict(cls: T.Type["T_MARK"], dct: T_DATA) -> "T_MARK":
        """
        Build a mark from its dictionary form.

        The input is deep-copied first, so the caller's dictionary is never
        modified. If the input has an ``attrs`` entry, the class declares an
        ``attrs`` field, and that field's declared type exposes
        ``from_dict()``, the entry is converted with it before the instance is
        constructed. Otherwise ``attrs`` is passed through as-is.

        :param dct: Dictionary describing the mark.
        :return: A new instance of ``cls``.
        """
        payload = copy.deepcopy(dct)
        # Only consult the declared fields when the input actually has attrs.
        attrs_field = None
        if "attrs" in payload:
            attrs_field = cls.get_fields().get("attrs")
        if attrs_field is not None and hasattr(attrs_field.type, "from_dict"):
            payload["attrs"] = attrs_field.type.from_dict(payload["attrs"])
        return super().from_dict(payload)



[docs]
    def to_dict(self) -> T_DATA:
        """
        Serialize to dictionary, handling nested attrs.

        ``Base.to_dict()`` relies on ``dataclasses.asdict``, which has already
        flattened a dataclass ``attrs`` value into a plain ``dict`` by the time
        it returns. A plain ``dict`` has no ``to_dict`` attribute, so the
        serializer must be looked up on the live ``self.attrs`` object instead
        of on the already-serialized value. When that object exposes
        ``to_dict()``, its output replaces the flattened entry; this is the
        same approach ``BaseNode.to_dict()`` takes.

        :return: Dictionary form of this mark.
        """
        data = super().to_dict()
        # ``BaseMark`` declares no ``attrs`` field itself; subclasses may, so
        # fall back to ``OPT`` when the attribute does not exist.
        attrs = getattr(self, "attrs", OPT)
        if attrs is not OPT and hasattr(attrs, "to_dict"):
            data["attrs"] = attrs.to_dict()
        return data



[docs]
    def to_markdown(self, text: str) -> str:
        """
        Apply this mark's formatting to ``text``.

        The default implementation returns the input text unchanged. This design
        reflects the library's philosophy:

        1. **Content extraction over formatting.** In the AI era, we convert ADF
           to Markdown primarily to extract textual content for LLMs, RAG systems,
           and knowledge bases. Preserving formatting is secondary to preserving
           content.

        2. **When in doubt, preserve content without formatting.** If a mark type
           doesn't have a standard Markdown equivalent (e.g., background color),
           we return the raw text rather than inventing custom syntax or losing
           the content entirely.

        3. **Use native Markdown only.** We prefer standard Markdown syntax
           (``**bold**``, ``*italic*``). Dialect-specific extensions are avoided.

        Subclasses override this method to apply formatting. For example,
        ``MarkStrong.to_markdown("text")`` returns ``"**text**"``.

        :param text: The text content to format.
        :return: The formatted text. This base implementation returns ``text``
            itself, unchanged.
        """
        return text




# Type variable bound to ``BaseMark``. ``BaseMark.from_dict`` refers to it by
# name in string annotations, which is why it can be defined after the class.
T_MARK = T.TypeVar("T_MARK", bound=BaseMark)


# =============================================================================
# BaseNode Class
# =============================================================================

[docs]
@dataclasses.dataclass(frozen=True)
class BaseNode(BaseMarkOrNode):
    """
    Base class for ADF nodes (document structure elements).

    Nodes represent structural elements of the document, such as:

    - Block nodes: ``paragraph``, ``heading``, ``codeBlock``, ``table``
    - Inline nodes: ``text``, ``mention``, ``emoji``

    Nodes can contain:

    - ``attrs``: Node-specific attributes
    - ``content``: Child nodes (for container nodes)
    - ``marks``: Text formatting (for inline nodes)

    Subclasses should override ``to_markdown()`` to provide format conversion;
    the implementation on this class raises ``NotImplementedError``.
    """


[docs]
    @classmethod
    def from_dict(
        cls: T.Type["T_NODE"],
        dct: T_DATA,
    ) -> "T_NODE":
        """
        Build a node from its dictionary form, deserializing nested parts first.

        The input is deep-copied, so the caller's dictionary is never modified.
        On the copy, three keys receive special treatment when present:

        - ``attrs``: converted with the ``from_dict()`` of the declared
          ``attrs`` field type, when the class declares such a field and its
          type exposes ``from_dict()``
        - ``content``: when it is a list, each item is parsed with
          ``parse_node()``
        - ``marks``: when it is a list, each item is parsed with
          ``parse_mark()``

        An item whose parser raises ``UnimplementedTypeError`` is left out of
        the resulting list; a warning is logged for it only when
        ``settings.WARN_UNIMPLEMENTED_TYPE`` is truthy. Any other exception
        propagates to the caller.

        :param dct: Dictionary describing the node.
        :return: A new instance of ``cls``.
        """
        # Imported inside the function rather than at module level -- presumably
        # to avoid a circular import with the parser modules; confirm before
        # moving these.
        from .marks.parse_mark import parse_mark
        from .nodes.parse_node import parse_node

        def _parse_each(raw_items, parser):
            """Parse every item with ``parser``, dropping unimplemented types."""
            parsed = []
            for raw in raw_items:
                try:
                    parsed.append(parser(raw))
                except UnimplementedTypeError as err:
                    # Unimplemented type: skip the item, optionally warn.
                    if settings.WARN_UNIMPLEMENTED_TYPE:
                        logger.warning(str(err))
            return parsed

        payload = copy.deepcopy(dct)

        # attrs: only consult the declared fields when the input has attrs.
        attrs_field = None
        if "attrs" in payload:
            attrs_field = cls.get_fields().get("attrs")
        if attrs_field is not None and hasattr(attrs_field.type, "from_dict"):
            payload["attrs"] = attrs_field.type.from_dict(payload["attrs"])

        # content (child nodes) first, then marks; non-list values are kept as-is.
        for key, parser in (("content", parse_node), ("marks", parse_mark)):
            if key not in payload or parser is None:
                continue
            raw_items = payload[key]
            if isinstance(raw_items, list):
                payload[key] = _parse_each(raw_items, parser)

        return super().from_dict(payload)



[docs]
    def to_dict(self) -> T_DATA:
        """
        Serialize this node to a dictionary, including attrs, content and marks.

        The starting point is ``dataclasses.asdict(self)``. Three entries are
        then replaced, but only when present and not the ``OPT`` sentinel:

        - ``attrs``: replaced by ``self.attrs.to_dict()`` when the attrs object
          exposes ``to_dict()``
        - ``content``: replaced by the ``to_dict()`` output of each child
        - ``marks``: replaced by the ``to_dict()`` output of each mark

        The result is finally passed through ``remove_optional``.

        :return: Dictionary form of this node.
        """
        # asdict() builds a new dict, so the frozen instance is left untouched.
        payload = dataclasses.asdict(self)

        # A missing key and an OPT value are handled the same way: skip it.
        if payload.get("attrs", OPT) is not OPT and hasattr(self.attrs, "to_dict"):
            payload["attrs"] = self.attrs.to_dict()

        # Serialize children from the live objects so their own to_dict() runs.
        for key in ("content", "marks"):
            if payload.get(key, OPT) is not OPT:
                payload[key] = [child.to_dict() for child in getattr(self, key)]

        return remove_optional(**payload)



[docs]
    def to_markdown(self, ignore_error: bool = False) -> str:
        """
        Convert this node to Markdown format.

        The default implementation raises ``NotImplementedError``. This is
        intentional for several reasons:

        1. **Fail fast during development.** When implementing new node types,
           we want to immediately discover which nodes haven't implemented
           ``to_markdown()`` rather than silently producing empty output or
           skipping content. This helps catch missing implementations early.

        2. **The ignore_error parameter provides an escape hatch.** In production,
           if our code has bugs or a node type is partially implemented, users
           can pass ``ignore_error=True`` to gracefully skip nodes that fail
           to convert. This flag should be propagated recursively to all nested
           ``to_markdown()`` calls via helper functions like ``content_to_markdown()``.

        3. **Error handling is explicit.** The library user decides whether to
           fail fast (for debugging and development) or degrade gracefully
           (for production use cases where partial output is acceptable).

        Subclasses must override this method to provide actual conversion logic.

        :param ignore_error: If True, errors in nested conversions are silently
            skipped. If False (default), errors propagate immediately. This flag
            should be passed down to any nested ``to_markdown()`` calls. The
            base implementation does not read it; it raises either way.
        :return: The Markdown representation of this node.
        :raises NotImplementedError: Always raised by the base class to ensure
            subclasses implement this method.
        """
        raise NotImplementedError(
            f"{self.__class__.__name__} has not implemented ``to_markdown()``"
        )




# Type variable bound to ``BaseNode``. ``BaseNode.from_dict`` refers to it by
# name in string annotations, which is why it can be defined after the class.
T_NODE = T.TypeVar("T_NODE", bound=BaseNode)