Source code for atlas_doc_parser.mark_or_node

# -*- coding: utf-8 -*-

"""
Base classes for ADF data model.

This module provides the foundational classes for deserializing Atlassian Document
Format (ADF) JSON into Python objects:

- ``Base``: Base class for all dataclasses
- ``BaseMarkOrNode``: Shared base class for marks and nodes (holds ``type``)
- ``BaseMark``: Base class for text marks (formatting)
- ``BaseNode``: Base class for document nodes
"""

import typing as T
import copy
import dataclasses

from func_args.api import BaseFrozenModel, REQ, OPT, remove_optional

from .type_hint import T_DATA
from .type_enum import TypeEnum, check_type_match
from .exc import UnimplementedTypeError
from .logger import logger

from . import settings


# Alias for a mapping of field name -> ``dataclasses.Field`` of one dataclass.
T_FIELDS = dict[str, dataclasses.Field]
# Module-level cache of field mappings, keyed by the dataclass itself.
# ``Base.get_fields()`` fills it lazily on the first lookup for each class.
_CLASS_FIELD: dict[T.Any, T_FIELDS] = {}  # class fields cache


@dataclasses.dataclass(frozen=True)
class Base(BaseFrozenModel):
    """
    Common ancestor of every ADF dataclass.

    Shared helpers:

    - ``from_dict()``: build an instance from a dictionary
    - ``to_dict()``: dump the instance to a dictionary
    """

    @classmethod
    def get_fields(cls) -> T_FIELDS:
        """
        Return a ``{field name: dataclasses.Field}`` mapping for this class.

        The mapping is memoized per class in the module-level ``_CLASS_FIELD``
        dict, so ``dataclasses.fields`` is only called the first time a given
        class is asked for its fields.
        """
        cached = _CLASS_FIELD.get(cls)
        if cached is None:
            cached = {f.name: f for f in dataclasses.fields(cls)}
            _CLASS_FIELD[cls] = cached
        return cached

    def to_dict(self) -> T_DATA:
        """
        Dump this dataclass to a dictionary.

        The instance is converted with ``dataclasses.asdict`` and the result
        is passed through ``remove_optional`` (from ``func_args``; presumably
        it drops entries that are still ``OPT`` -- confirm against func_args).
        """
        as_dict = dataclasses.asdict(self)
        return remove_optional(**as_dict)

    def to_kwargs(self) -> T_DATA:
        """
        Return the instance as keyword arguments for a function call.

        Currently this is exactly the output of :meth:`to_dict`.
        """
        return self.to_dict()

    @classmethod
    def from_dict(cls, dct: T_DATA) -> "Base":
        """
        Build an instance from a dictionary, ignoring unknown keys.

        Only keys that match a field declared on the dataclass are forwarded
        to the constructor. This is deliberate defensive programming: the
        Atlassian Document Format may gain new fields over time, and an
        outdated version of this library must not fail just because the input
        carries keys it does not know about.

        :param dct: The source dictionary.

        :return: A new instance of ``cls``.
        """
        kwargs = {}
        for name in cls.get_fields():
            try:
                value = dct[name]
            except KeyError:
                # Field absent from the input: let the dataclass default apply.
                continue
            kwargs[name] = value
        return cls(**kwargs)

    def is_opt(self, value: T.Any) -> bool:
        """Return True if ``value`` is the ``OPT`` sentinel (identity check)."""
        return value is OPT
# Type variable bound to ``Base``, for annotating code that is generic over
# ``Base`` subclasses.
T_BASE = T.TypeVar("T_BASE", bound=Base)
@dataclasses.dataclass(frozen=True)
class BaseMarkOrNode(Base):
    """
    Shared parent of ADF marks and ADF nodes.

    Both kinds of element carry a ``type`` string, which is declared here.
    """

    # ``REQ`` comes from ``func_args``; presumably it flags the field as
    # required -- confirm against func_args.
    type: str = dataclasses.field(default_factory=REQ)

    def is_type_of(
        self,
        expected_types: TypeEnum | list[TypeEnum],
    ) -> bool:
        """
        Tell whether this element's ``type`` is one of the expected types.

        The comparison itself is delegated to ``check_type_match``.

        :param expected_types: A single ``TypeEnum`` member, or a list of
            members. With a list, a match against ANY member counts.

        :return: True if this element's type matches (any of) the expected
            type(s).

        Example::

            >>> BaseNode(...).is_type_of(TypeEnum.paragraph)
            True
            >>> BaseMark(...).is_type_of([TypeEnum.strong, TypeEnum.em])
            True
        """
        own_type = self.type
        return check_type_match(own_type, expected_types)
# =============================================================================
# BaseMark Class
# =============================================================================
@dataclasses.dataclass(frozen=True)
class BaseMark(BaseMarkOrNode):
    """
    Base class for ADF marks (text formatting).

    Marks represent formatting applied to text nodes, such as:

    - ``strong`` (bold)
    - ``em`` (italic)
    - ``link`` (hyperlink)
    - ``code`` (inline code)

    Subclasses should override ``to_markdown()`` to provide format conversion.
    """

    @classmethod
    def from_dict(cls: T.Type["T_MARK"], dct: T_DATA) -> "T_MARK":
        """
        Deserialize a mark from a dictionary.

        If the subclass declares an ``attrs`` field whose annotated type
        exposes ``from_dict()``, the raw ``attrs`` value is deserialized with
        it first; otherwise ``attrs`` is passed through untouched.

        :param dct: The raw mark dictionary. It is deep-copied, so the
            caller's object is never mutated.

        :return: A new instance of ``cls``.
        """
        dct = copy.deepcopy(dct)
        fields = cls.get_fields()
        if "attrs" in dct and "attrs" in fields:
            attrs_type = fields["attrs"].type
            # Only types that know how to build themselves are converted.
            if hasattr(attrs_type, "from_dict"):
                dct["attrs"] = attrs_type.from_dict(dct["attrs"])
        return super().from_dict(dct)

    def to_dict(self) -> T_DATA:
        """
        Serialize to a dictionary, serializing nested ``attrs`` on their own.

        ``super().to_dict()`` relies on ``dataclasses.asdict``, which has
        already turned a dataclass ``attrs`` into a plain ``dict``. Testing
        that dict for a ``to_dict`` attribute can never succeed, so the attrs
        object is read from the instance instead and serialized through its
        own ``to_dict()``.

        :return: The dictionary form of this mark.
        """
        data = super().to_dict()
        # Subclasses without an ``attrs`` field fall back to ``OPT`` here.
        attrs = getattr(self, "attrs", OPT)
        if "attrs" in data and attrs is not OPT and hasattr(attrs, "to_dict"):
            data["attrs"] = attrs.to_dict()
        return data

    def to_markdown(self, text: str) -> str:
        """
        Apply this mark's formatting to text.

        The default implementation returns the input text unchanged. This
        design reflects the library's philosophy:

        1. **Content extraction over formatting.** ADF is converted to
           Markdown primarily to extract textual content for LLMs, RAG
           systems, and knowledge bases. Preserving formatting is secondary
           to preserving content.
        2. **When in doubt, preserve content without formatting.** If a mark
           type has no standard Markdown equivalent (e.g., background color),
           the raw text is returned rather than inventing custom syntax or
           losing the content entirely.
        3. **Use native Markdown only.** Standard Markdown syntax
           (``**bold**``, ``*italic*``) is preferred; dialect-specific
           extensions are avoided.

        Subclasses override this method to apply formatting. For example,
        ``MarkStrong.to_markdown("text")`` returns ``"**text**"``.

        :param text: The text content to format.

        :return: The formatted text. The base class returns ``text``
            unchanged.
        """
        return text
# Type variable bound to ``BaseMark``; ``BaseMark.from_dict`` uses it to
# annotate that it returns an instance of the class it is called on.
T_MARK = T.TypeVar("T_MARK", bound=BaseMark)


# =============================================================================
# BaseNode Class
# =============================================================================
@dataclasses.dataclass(frozen=True)
class BaseNode(BaseMarkOrNode):
    """
    Base class for ADF nodes (document structure elements).

    Nodes represent structural elements of the document, such as:

    - Block nodes: ``paragraph``, ``heading``, ``codeBlock``, ``table``
    - Inline nodes: ``text``, ``mention``, ``emoji``

    Nodes can contain:

    - ``attrs``: Node-specific attributes
    - ``content``: Child nodes (for container nodes)
    - ``marks``: Text formatting (for inline nodes)

    Subclasses should override ``to_markdown()`` to provide format conversion.
    """

    @staticmethod
    def _parse_items(
        raw_items: list,
        parse_one: T.Callable[[T.Any], T.Any],
    ) -> list:
        """
        Parse each raw item with ``parse_one``, skipping unimplemented types.

        An ``UnimplementedTypeError`` drops the item (with a warning when
        ``settings.WARN_UNIMPLEMENTED_TYPE`` is truthy). Any other exception
        propagates to the caller.
        """
        parsed = []
        for raw in raw_items:
            try:
                parsed.append(parse_one(raw))
            except UnimplementedTypeError as e:
                # Skip this item and continue with the remaining ones.
                if settings.WARN_UNIMPLEMENTED_TYPE:
                    logger.warning(str(e))
        return parsed

    @classmethod
    def from_dict(
        cls: T.Type["T_NODE"],
        dct: T_DATA,
    ) -> "T_NODE":
        """
        Deserialize a node from a dictionary.

        Handles nested deserialization of:

        - ``attrs``: Using the field type's ``from_dict()`` when it has one
        - ``content``: Using ``parse_node()`` for each child
        - ``marks``: Using ``parse_mark()`` for each mark

        ``content`` / ``marks`` values that are not lists are left untouched.
        Unimplemented node/mark types are skipped with an optional warning
        (controlled by ``settings.WARN_UNIMPLEMENTED_TYPE``). Other parsing
        errors are propagated normally.

        :param dct: The raw node dictionary. It is deep-copied, so the
            caller's object is never mutated.

        :return: A new instance of ``cls``.
        """
        # Imported inside the method, as in the original code (presumably to
        # avoid a circular import at module load time).
        from .marks.parse_mark import parse_mark
        from .nodes.parse_node import parse_node

        dct = copy.deepcopy(dct)

        # Deserialize attrs
        fields = cls.get_fields()
        if "attrs" in dct and "attrs" in fields:
            attrs_type = fields["attrs"].type
            if hasattr(attrs_type, "from_dict"):
                dct["attrs"] = attrs_type.from_dict(dct["attrs"])

        # Deserialize content (child nodes) and marks with the same routine.
        for key, parse_one in (("content", parse_node), ("marks", parse_mark)):
            if isinstance(dct.get(key), list):
                dct[key] = cls._parse_items(dct[key], parse_one)

        return super().from_dict(dct)

    def to_dict(self) -> T_DATA:
        """
        Serialize to a dictionary, handling nested attrs, content, and marks.

        The dict is built with ``dataclasses.asdict`` so the frozen instance
        is never modified. Nested ``attrs``, ``content`` and ``marks`` are
        then replaced with the output of their own ``to_dict()`` methods, and
        the result is passed through ``remove_optional``.

        :return: The dictionary form of this node.
        """
        data = dataclasses.asdict(self)

        # Serialize attrs
        if "attrs" in data and data["attrs"] is not OPT:
            if hasattr(self.attrs, "to_dict"):
                data["attrs"] = self.attrs.to_dict()

        # Serialize content
        if "content" in data and data["content"] is not OPT:
            data["content"] = [child.to_dict() for child in self.content]

        # Serialize marks
        if "marks" in data and data["marks"] is not OPT:
            data["marks"] = [mark.to_dict() for mark in self.marks]

        return remove_optional(**data)

    def to_markdown(self, ignore_error: bool = False) -> str:
        """
        Convert this node to Markdown format.

        The default implementation raises ``NotImplementedError``. This is
        intentional for several reasons:

        1. **Fail fast during development.** A node type that has not
           implemented ``to_markdown()`` is discovered immediately instead of
           silently producing empty output or skipping content.
        2. **The ignore_error parameter provides an escape hatch.** In
           production, users can pass ``ignore_error=True`` to gracefully
           skip nodes that fail to convert. The flag should be propagated
           recursively to all nested ``to_markdown()`` calls via helper
           functions like ``content_to_markdown()``.
        3. **Error handling is explicit.** The library user decides whether
           to fail fast (debugging and development) or degrade gracefully
           (production use cases where partial output is acceptable).

        Subclasses must override this method to provide actual conversion
        logic.

        :param ignore_error: If True, errors in nested conversions are
            silently skipped. If False (default), errors propagate
            immediately. The base implementation does not read this flag; it
            exists so overrides share one signature.

        :return: The Markdown representation of this node.

        :raises NotImplementedError: Always raised by the base class to
            ensure subclasses implement this method.
        """
        raise NotImplementedError(
            f"{self.__class__.__name__} has not implemented ``to_markdown()``"
        )
# Type variable bound to ``BaseNode``; ``BaseNode.from_dict`` uses it to
# annotate that it returns an instance of the class it is called on.
T_NODE = T.TypeVar("T_NODE", bound=BaseNode)