Source code for archivum.bibtex

"""
Functions for creating bibtex entries from various iterables.

v2  Hack off gemini, which actually was very poor for task at hand.
v1  Gemini.
"""
from functools import partial
import logging
import re
from pathlib import Path
from textwrap import wrap
from typing import Any, Callable, Iterable, List
import pandas as pd

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)



[docs]
def sanitize_for_latex(val: Any) -> str:
    """
    Return ``val`` as text that can be embedded in a LaTeX/BibTeX field.

    Anything ``pd.isna`` reports as missing becomes ``""``.  Numbers are
    stringified without any escaping, and a float holding a whole number
    loses its fractional part (``2017.0`` -> ``"2017"``).  For every other
    value the string form has en/em dashes rewritten as ``--``/``---`` and
    each ``&``, ``%``, ``_`` and ``#`` prefixed with a backslash unless a
    backslash already precedes it.
    """
    if pd.isna(val):
        return ""

    if isinstance(val, (float, int)):
        # Only a float can carry a redundant ".0"; everything else is
        # returned in its plain string form.
        whole_float = isinstance(val, float) and val.is_integer()
        return str(int(val)) if whole_float else str(val)

    text = str(val)

    # Unicode en/em dashes -> their LaTeX spellings.
    for dash, latex_dash in (('–', '--'), ('—', '---')):
        text = text.replace(dash, latex_dash)

    # LaTeX specials.  The negative lookbehind in each pattern skips
    # characters that already follow a backslash, so running the function
    # twice does not double-escape.
    escape_rules = (
        (r'(?<!\\)&', r'\&'),
        (r'(?<!\\)%', r'\%'),
        (r'(?<!\\)_', r'\_'),
        (r'(?<!\\)#', r'\#'),
    )
    for pattern, replacement in escape_rules:
        text = re.sub(pattern, replacement, text)

    return text



# Entry types that ``dict_to_bibtex`` writes as-is; it lower-cases the
# record's ``type`` and replaces any value not listed here with "misc".
BIBTEX_ALLOWED_TYPES = {
    "article",
    "book",
    "techreport",
    "misc",
    "incollection",
    "unpublished",
    "inproceedings",
    "phdthesis",
}



[docs]
def format_mendeley_file(path: Any) -> str:
    """
    Format a path using Mendeley's BibTeX file-field convention.

    Windows paths are rendered as ``:C\\:/path/to/file.pdf:pdf``, whether
    they are written with forward slashes or backslashes and regardless of
    the host platform.  Other paths keep their absolute/relative POSIX form.
    The final segment is the lower-cased file extension, defaulting to
    ``pdf`` when the path has none.

    Args:
        path: A string or path-like object; ``None``, NaN/NA and blank
            strings are treated as "no file".

    Returns:
        The formatted file field, or ``""`` when there is no usable path.
    """
    if path is None or pd.isna(path):
        return ""
    # Path("") silently becomes ".", which would produce the bogus ":.:pdf".
    if isinstance(path, str) and not path.strip():
        return ""

    p = Path(path)
    posix_path = p.as_posix()

    # On a POSIX host a backslash-style Windows path is a single opaque
    # name, so as_posix() leaves the backslashes in place.  Normalise them
    # here so the drive-letter branch below can recognise the path.  On
    # Windows as_posix() never contains backslashes, so this is a no-op.
    if re.match(r"^[A-Za-z]:\\", posix_path):
        posix_path = posix_path.replace("\\", "/")
        p = Path(posix_path)

    suffix = p.suffix.lower().lstrip(".") or "pdf"

    if p.drive:
        return f":{p.drive[0]}\\:{posix_path[2:]}:{suffix}"
    # Windows-style path seen on a host where pathlib reports no drive.
    if re.match(r"^[A-Za-z]:/", posix_path):
        return f":{posix_path[0]}\\:{posix_path[2:]}:{suffix}"
    return f":{posix_path}:{suffix}"




[docs]
def dict_to_bibtex(data: Any, allowed_fields: List[str] = None, raw_fields: Iterable[str] = None) -> str:
    """
    Convert a dict-like record into one aligned, sanitized BibTeX entry.

    Args:
        data: A ``dict``, or any object exposing ``to_dict()`` (e.g. a pandas
            Series) or ``_asdict()`` (a namedtuple).  The ``type`` key
            selects the entry type: a missing key means ``article`` and a
            value outside ``BIBTEX_ALLOWED_TYPES`` becomes ``misc``.  The
            ``tag`` key supplies the cite key; a missing, NaN/NA or blank
            tag falls back to ``unknown``.
        allowed_fields: Optional whitelist naming the fields to emit, in
            order.  When empty or ``None`` every key is emitted except
            ``type``, ``tag``, ``merge_count`` and keys starting with
            ``arc-`` or ``mendeley-``.
        raw_fields: Field names whose values bypass LaTeX sanitization
            because the literal value matters, such as Mendeley ``file``
            paths.

    Returns:
        The BibTeX entry text, or ``""`` when ``data`` is ``None``, is not
        dict-like, or has no field with a usable value.
    """
    if data is None:
        return ""

    # Handle pandas objects
    if hasattr(data, "to_dict"):
        data = data.to_dict()

    # Handle NamedTuple (often returned by itertuples)
    if hasattr(data, "_asdict"):
        data = data._asdict()

    if not isinstance(data, dict):
        return ""

    # Standard header fields
    bib_type = str(data.get('type', 'article')).lower()
    if bib_type not in BIBTEX_ALLOWED_TYPES:
        bib_type = "misc"

    # A tag that is absent, NaN/NA or blank must not leak into the entry as
    # the literal cite key "nan"/"None"; compare on the string form so this
    # stays safe for any tag type.
    tag = data.get('tag')
    tag_text = "" if tag is None else str(tag).strip()
    cite_key = tag_text if tag_text not in ("", "nan", "<NA>", "NaT") else "unknown"

    raw_fields = set(raw_fields or [])
    header_keys = {'type', 'tag'}

    # Determine which fields to process
    if allowed_fields:
        # Use whitelist, excluding type/tag which are in the header
        keys = [k for k in allowed_fields if k not in header_keys]
    else:
        # Fallback: process all fields except blacklisted ones
        keys = [k for k in data.keys() if k not in header_keys
                and not k.startswith(('arc-', 'mendeley-'))
                and k != 'merge_count']

    # Filter out empty/NaN and sanitize
    processed_data = {}
    for k in keys:
        v = data.get(k)
        if pd.isna(v) or str(v).strip() in ("", "nan"):
            continue

        sanitized_v = str(v) if k in raw_fields else sanitize_for_latex(v)
        if sanitized_v:
            # Title preservation: add an extra pair of braces around
            # title-like fields so BibTeX styles keep their capitalisation,
            # but only when the value does not already start with '{'
            # (avoids triple bracing {{ { ... } }}).
            if k in ('title', 'journal', 'booktitle') and not str(sanitized_v).startswith('{'):
                processed_data[k] = f"{{{sanitized_v}}}"
            else:
                processed_data[k] = sanitized_v

    if not processed_data:
        return ""

    # Pad field names so every '=' lines up in the output.
    max_len = max(len(k) for k in processed_data)

    lines = [f"@{bib_type}{{{cite_key},"]
    for k, v in processed_data.items():
        padding = " " * (max_len - len(k))
        lines.append(f"  {k}{padding} = {{{v}}},")
    lines.append("}")

    return "\n".join(lines)




[docs]
def rows_to_bibtex(
    rows: Any,
    allowed_fields: List[str] = None,
    *,
    include_hash: bool = False,
    include_file: bool = False,
    path_resolver: Callable[[Any], Path] | None = None,
) -> str:
    """
    Convert dataframe-like rows to BibTeX text using ``dict_to_bibtex``.

    This is the shared path for library-level and ad hoc web exports. The
    optional ``include_hash`` and ``include_file`` flags produce Archivum's
    enriched BibTeX+ export without changing normal library BibTeX output.
    """
    if rows is None:
        return ""

    if isinstance(rows, pd.DataFrame):
        source = _dedupe_bibtex_dataframe(rows)
        records = [row.to_dict() for _, row in source.iterrows()]
    else:
        records = []
        for row in rows:
            if hasattr(row, "to_dict"):
                row = row.to_dict()
            if hasattr(row, "_asdict"):
                row = row._asdict()
            if isinstance(row, dict):
                records.append(row)

    fields = list(allowed_fields or [])
    raw_fields = set()
    if include_hash and "hash" not in fields:
        fields.append("hash")
    if include_file:
        if "file" not in fields:
            fields.append("file")
        raw_fields.add("file")

    entries = []
    for record in records:
        record = dict(record)
        if include_file and record.get("path"):
            file_path = record["path"]
            if path_resolver is not None:
                file_path = path_resolver(file_path)
            record["file"] = format_mendeley_file(file_path)
        entry = dict_to_bibtex(record, allowed_fields=fields, raw_fields=raw_fields)
        if entry:
            entries.append(entry)
    return "\n\n".join(entries)



def _dedupe_bibtex_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Avoid duplicate BibTeX entries from exploded author/database rows.

    The first of ``tag`` / ``hash`` that exists as a column is used as the
    key (only that one, even if both exist).  Rows with a non-null,
    non-blank key keep their first occurrence and are placed first; rows
    without a key follow unchanged.  The result has a fresh 0..n-1 index.
    When neither column exists ``df`` itself is returned.
    """
    key_column = next((name for name in ("tag", "hash") if name in df.columns), None)
    if key_column is None:
        return df

    key_values = df[key_column]
    has_key = key_values.notna() & (key_values.astype(str).str.strip() != "")

    unique_keyed = df[has_key].drop_duplicates(subset=[key_column], keep="first")
    without_key = df[~has_key]
    return pd.concat([unique_keyed, without_key], ignore_index=True)



[docs]
def bibtex_to_dict(bibtex_str: str) -> dict[str, dict[str, str]]:
    """
    Very simple BibTeX parser for a single entry.

    Only fields written as ``key = {value}`` or ``key = "value"`` and ending
    at the end of a line are recognised.  Field names are lower-cased; the
    entry type is kept as written.

    Args:
        bibtex_str: Text of one BibTeX entry; the leading ``@`` is optional.

    Returns:
        ``{tag: {field: value, 'type': entry_type}}`` with surrounding
        whitespace removed from the tag, or ``{}`` when the input is empty
        or no ``@type{tag,`` header is found.
    """
    if not bibtex_str:
        return {}

    # Normalize line endings so the field pattern only has to deal with "\n".
    entry = bibtex_str.strip().replace('\r\n', '\n').replace('\r', '\n')

    # Header: @type{tag,
    header_match = re.match(r"\s*@?([a-zA-Z0-9\.\\\-_]+)\s*\{\s*([^,]+),", entry)
    if not header_match:
        return {}

    entry_type, tag = header_match.groups()
    # The tag group captures everything up to the comma, so a header such
    # as "@article{ key ," would otherwise yield the key "key ".
    tag = tag.strip()
    result = {"type": entry_type}

    # Body: everything after the header, minus the entry's closing brace.
    body = entry[header_match.end():].strip()
    if body.endswith('}'):
        body = body[:-1]

    # Fields: key = {value} or key = "value"
    # This regex is simplified but covers most cases.  The lazy value match
    # stops at the first closing delimiter that is followed by an optional
    # comma and a line end, which lets braces nested inside a value through.
    field_pattern = r"\s*([a-zA-Z0-9\-_]+)\s*=\s*[\{\"](.*?)[\}\"]\s*,?\s*(?:\n|\Z)"
    for m in re.finditer(field_pattern, body, flags=re.DOTALL):
        k, v = m.groups()
        # NOTE: a field literally named "type" replaces the entry type
        # stored above, because both share the same key.
        result[k.lower()] = v.strip()

    return {tag: result}




[docs]
def dict_to_bibtex_crossref(data: Any) -> str:
    """
    Convert one Crossref work record into a BibTeX entry string.

    Suitable for the return value from Crossref.  (Originally generated
    with Gemini.)

    Args:
        data: A ``dict``, or an object exposing ``to_dict()`` or
            ``_asdict()``, using Crossref key names (``author``, ``title``,
            ``container-title``, ``published-print``, ``DOI``, ...).

    Returns:
        The BibTeX entry, or ``""`` when ``data`` is ``None`` or not
        dict-like.  The cite key is the first author's family name (letters
        and digits only, ``Unknown`` if unavailable) followed by the year
        (``nd`` if unavailable).  The container title is written as
        ``booktitle`` for ``incollection``/``inproceedings`` entries and as
        ``journal`` otherwise.  ``&``, ``%`` and ``_`` are backslash-escaped
        in every value.
    """
    if data is None:
        return ""

    if hasattr(data, "to_dict"):
        data = data.to_dict()
    if hasattr(data, "_asdict"):
        data = data._asdict()

    if not isinstance(data, dict):
        return ""

    def get_list_safe(key: str) -> str:
        """Return the first item of a list-valued key, else the value as text."""
        val = data.get(key)
        if isinstance(val, (list, tuple)):
            return str(val[0]) if val else ""
        return str(val) if val else ""

    def get_year() -> str:
        """Return the year from the first date source that carries one."""
        for source in ('published-print', 'published-online', 'created'):
            container = data.get(source)
            # The key may be present with a None value; skip it instead of
            # failing on ``None.get``.
            if not isinstance(container, dict):
                continue
            date_parts = container.get('date-parts')
            if date_parts and date_parts[0] and date_parts[0][0] is not None:
                return str(date_parts[0][0])
        return "nd"

    ctype = data.get('type', 'misc')
    # BibTeX types map to themselves; Crossref type names map to the closest
    # BibTeX type.  Anything else becomes "misc".
    type_map = {
        'article': 'article',
        'book': 'book',
        'techreport': 'techreport',
        'misc': 'misc',
        'incollection': 'incollection',
        'inproceedings': 'inproceedings',
        'phdthesis': 'phdthesis',
        'journal-article': 'article',
        'book-chapter': 'incollection',
        'proceedings-article': 'inproceedings',
        'monograph': 'book',
        'report': 'techreport',
        'dissertation': 'phdthesis'
    }

    bib_type = type_map.get(ctype, 'misc')

    authors = data.get('author')
    formatted_authors = []
    first_author_family = "Unknown"

    if isinstance(authors, list):
        # Ignore malformed (non-dict) author entries rather than crashing.
        people = [auth for auth in authors if isinstance(auth, dict)]
        if people:
            first_author_family = people[0].get('family') or "Unknown"
        for auth in people:
            family = auth.get('family')
            given = auth.get('given')
            if family and given:
                formatted_authors.append(f"{family}, {given}")
            elif family:
                formatted_authors.append(str(family))
            elif auth.get('name'):
                # Entries without a family name but with a 'name' value.
                formatted_authors.append(str(auth['name']))

    author_str = " and ".join(formatted_authors)

    year = get_year()

    # Cite keys keep only letters and digits of the family name.
    safe_family = "".join(filter(str.isalnum, str(first_author_family))) or "Unknown"
    cite_key = f"{safe_family}{year}"

    # incollection/inproceedings entries name their container 'booktitle';
    # standard BibTeX styles ignore a 'journal' field on those types.
    container_field = 'booktitle' if bib_type in ('incollection', 'inproceedings') else 'journal'

    fields = {
        'author': author_str,
        'title': get_list_safe('title'),
        container_field: get_list_safe('container-title'),
        'year': year,
        'volume': data.get('volume'),
        'number': data.get('issue'),
        'pages': data.get('page'),
        'doi': data.get('DOI'),
        'publisher': data.get('publisher'),
        'url': data.get('URL'),
        # A list yields its first ISBN; a plain string is kept whole.
        'isbn': get_list_safe('ISBN'),
    }

    # Drop empty values; dict order fixes the field order of the output.
    active_fields = {k: v for k, v in fields.items() if v}
    if not active_fields:
        return ""

    # Pad field names so every '=' lines up in the output.
    max_len = max(len(k) for k in active_fields)

    lines = [f"@{bib_type}{{{cite_key},"]
    for k, v in active_fields.items():
        clean_val = str(v).replace('&', '\\&').replace('%', '\\%').replace('_', '\\_')
        padding = " " * (max_len - len(k))
        lines.append(f"  {k}{padding} = {{{clean_val}}},")
    lines.append("}")

    return "\n".join(lines)