"""
Functions for creating bibtex entries from various iterables.
v2: Reworked by hand from the Gemini draft, which turned out to be very poor for the task at hand.
v1: Gemini-generated draft.
"""
from functools import partial
import logging
import re
from pathlib import Path
from textwrap import wrap
from typing import Any, Callable, Iterable, List
import pandas as pd
logger = logging.getLogger(__name__)
[docs]
def sanitize_for_latex(val: Any) -> str:
    """
    Sanitize a value for LaTeX/BibTeX compatibility.

    Args:
        val: Any value. Missing scalars (``None``, ``NaN``, ``pd.NA``,
            ``NaT``) are treated as empty.

    Returns:
        ``""`` for missing scalars. Numbers are rendered as text, with
        integral floats written without the trailing ``.0``
        (``2017.0`` -> ``"2017"``). Anything else is stringified, en/em
        dashes become ``--``/``---``, and ``&``, ``%``, ``_`` and ``#`` are
        backslash-escaped unless they are already preceded by a backslash.
        Braces and other LaTeX specials are left untouched.
    """
    # ``pd.isna`` answers element-wise for list-likes, and using that array
    # as a condition raises ``ValueError`` once it has more than one
    # element, so only scalars are asked whether they are missing.
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return ""

    # Handle numbers: convert 2017.0 to 2017
    if isinstance(val, (float, int)):
        if isinstance(val, float) and val.is_integer():
            return str(int(val))
        return str(val)

    text = str(val)

    # 1. Unicode en/em dashes -> LaTeX dashes
    text = text.replace("\u2013", "--").replace("\u2014", "---")

    # 2. Escape &, %, _ and # in a single pass. The negative lookbehind skips
    #    characters that already have a backslash in front of them, which
    #    avoids double escaping.
    return re.sub(r'(?<!\\)([&%_#])', r'\\\1', text)
# Entry types that ``dict_to_bibtex`` writes out unchanged; any other type
# is emitted as ``misc``. Kept in alphabetical order.
BIBTEX_ALLOWED_TYPES = {
    "article",
    "book",
    "incollection",
    "inproceedings",
    "misc",
    "phdthesis",
    "techreport",
    "unpublished",
}
[docs]
def dict_to_bibtex(data: Any, allowed_fields: List[str] = None, raw_fields: Iterable[str] = None) -> str:
"""
Converts a dict-like object to a sanitized BibTeX string.
``raw_fields`` bypasses LaTeX sanitization for fields where the literal
value matters, such as Mendeley ``file`` paths.
"""
if data is None:
return ""
# Handle pandas objects
if hasattr(data, "to_dict"):
data = data.to_dict()
# Handle NamedTuple (often returned by itertuples)
if hasattr(data, "_asdict"):
data = data._asdict()
if not isinstance(data, dict):
return ""
# Standard header fields
bib_type = str(data.get('type', 'article')).lower()
if bib_type not in BIBTEX_ALLOWED_TYPES:
bib_type = "misc"
cite_key = str(data.get('tag', 'unknown'))
raw_fields = set(raw_fields or [])
# Determine which fields to process
if allowed_fields:
# Use whitelist, excluding type/tag which are in the header
keys = [k for k in allowed_fields if k not in {'type', 'tag'}]
else:
# Fallback: process all fields except blacklisted ones
keys = [k for k in data.keys() if k not in {'type', 'tag'}
and not k.startswith(('arc-', 'mendeley-'))
and k != 'merge_count']
# Filter out empty/NaN and sanitize
processed_data = {}
for k in keys:
v = data.get(k)
if pd.isna(v) or str(v).strip() in ("", "nan"):
continue
sanitized_v = str(v) if k in raw_fields else sanitize_for_latex(v)
if sanitized_v:
# Title preservation: wrap in double braces if it's a title/journal
# but ONLY if not already braced.
# We check for a single '{' at start to avoid triple bracing {{ { ... } }}
if k in ('title', 'journal', 'booktitle') and not str(sanitized_v).startswith('{'):
processed_data[k] = f"{{{sanitized_v}}}"
else:
processed_data[k] = sanitized_v
if not processed_data:
return ""
max_len = max(len(k) for k in processed_data)
lines = [f"@{bib_type}{{{cite_key},"]
for k, v in processed_data.items():
padding = " " * (max_len - len(k))
lines.append(f" {k}{padding} = {{{v}}},")
lines.append("}")
return "\n".join(lines)
[docs]
def rows_to_bibtex(
rows: Any,
allowed_fields: List[str] = None,
*,
include_hash: bool = False,
include_file: bool = False,
path_resolver: Callable[[Any], Path] | None = None,
) -> str:
"""
Convert dataframe-like rows to BibTeX text using ``dict_to_bibtex``.
This is the shared path for library-level and ad hoc web exports. The
optional ``include_hash`` and ``include_file`` flags produce Archivum's
enriched BibTeX+ export without changing normal library BibTeX output.
"""
if rows is None:
return ""
if isinstance(rows, pd.DataFrame):
source = _dedupe_bibtex_dataframe(rows)
records = [row.to_dict() for _, row in source.iterrows()]
else:
records = []
for row in rows:
if hasattr(row, "to_dict"):
row = row.to_dict()
if hasattr(row, "_asdict"):
row = row._asdict()
if isinstance(row, dict):
records.append(row)
fields = list(allowed_fields or [])
raw_fields = set()
if include_hash and "hash" not in fields:
fields.append("hash")
if include_file:
if "file" not in fields:
fields.append("file")
raw_fields.add("file")
entries = []
for record in records:
record = dict(record)
if include_file and record.get("path"):
file_path = record["path"]
if path_resolver is not None:
file_path = path_resolver(file_path)
record["file"] = format_mendeley_file(file_path)
entry = dict_to_bibtex(record, allowed_fields=fields, raw_fields=raw_fields)
if entry:
entries.append(entry)
return "\n\n".join(entries)
def _dedupe_bibtex_dataframe(df: pd.DataFrame) -> pd.DataFrame:
"""Avoid duplicate BibTeX entries from exploded author/database rows."""
for column in ("tag", "hash"):
if column in df.columns:
present = df[column].notna() & (df[column].astype(str).str.strip() != "")
keyed = df[present].drop_duplicates(subset=[column], keep="first")
unkeyed = df[~present]
return pd.concat([keyed, unkeyed], ignore_index=True)
return df
[docs]
def bibtex_to_dict(bibtex_str: str) -> dict[str, dict[str, str]]:
    """
    Very simple BibTeX parser for a single entry.

    Args:
        bibtex_str: Text of one entry, e.g. ``@article{key, title = {...}}``.

    Returns:
        ``{tag: {field: value, 'type': entry_type}}`` with lower-cased field
        names and the entry type as written, or ``{}`` when the input is
        empty or has no recognisable ``@type{tag,`` header.

    Values may be delimited by braces or double quotes, or be bare tokens
    such as ``year = 2020``. Only one delimiter layer is removed, so
    ``{{Title}}`` is returned as ``{Title}``. This is a regex-based parser,
    not a full BibTeX grammar: each field must end at a line break.
    """
    if not bibtex_str:
        return {}

    # Normalize line endings
    entry = bibtex_str.strip().replace('\r\n', '\n').replace('\r', '\n')

    # Header: @type{tag,
    header_match = re.match(r"\s*@?([a-zA-Z0-9\.\\\-_]+)\s*\{\s*([^,]+),", entry)
    if not header_match:
        return {}
    entry_type, tag = header_match.groups()
    # ``[^,]+`` also swallows whitespace before the comma; a cite key never
    # legitimately carries it.
    tag = tag.strip()
    result = {"type": entry_type}

    # Body: everything after the header, minus the entry's closing brace.
    body = entry[header_match.end():].strip()
    if body.endswith('}'):
        body = body[:-1]

    # Fields: key = {value} | key = "value" | key = bare_value
    # The delimited alternative is tried first; the bare alternative cannot
    # start with a delimiter and stays on one line, so it only picks up
    # values the delimited form cannot match (numbers, month macros, ...).
    field_pattern = (
        r"\s*([a-zA-Z0-9\-_]+)\s*=\s*"
        r"(?:[\{\"](.*?)[\}\"]|([^\s,{}\"][^,\n]*?))"
        r"\s*,?\s*(?:\n|\Z)"
    )
    for m in re.finditer(field_pattern, body, flags=re.DOTALL):
        key, delimited, bare = m.groups()
        value = delimited if delimited is not None else bare
        result[key.lower()] = value.strip()
    return {tag: result}
[docs]
def dict_to_bibtex_crossref(data: Any) -> str:
    """
    Convert a Crossref-style work record to a BibTeX entry string.

    Suitable for the return value from Crossref.

    Args:
        data: A ``dict``, or an object exposing ``to_dict()`` / ``_asdict()``,
            using Crossref keys such as ``author``, ``title``,
            ``container-title``, ``published-print``, ``DOI`` and ``ISBN``.

    Returns:
        The BibTeX entry, or ``""`` when ``data`` is ``None``, is not
        dict-like, or carries no metadata at all.

    The cite key is the first author's family name (alphanumerics only,
    ``Unknown`` if unavailable) followed by the year, or ``nd`` when no date
    is present. ``&``, ``%`` and ``_`` are backslash-escaped in every value
    unless they are already escaped.
    """
    if data is None:
        return ""
    if hasattr(data, "to_dict"):
        data = data.to_dict()
    if hasattr(data, "_asdict"):
        data = data._asdict()
    if not isinstance(data, dict):
        return ""

    def get_list_safe(key: str) -> str:
        """Return the first element of a list value, else the value itself, as text."""
        val = data.get(key)
        if isinstance(val, list) and val:
            return str(val[0])
        return str(val) if val else ""

    # Map both plain BibTeX types and Crossref work types onto BibTeX types.
    type_map = {
        'article': 'article',
        'book': 'book',
        'techreport': 'techreport',
        'misc': 'misc',
        'incollection': 'incollection',
        'inproceedings': 'inproceedings',
        'phdthesis': 'phdthesis',
        'journal-article': 'article',
        'book-chapter': 'incollection',
        'proceedings-article': 'inproceedings',
        'monograph': 'book',
        'report': 'techreport',
        'dissertation': 'phdthesis',
    }
    bib_type = type_map.get(data.get('type', 'misc'), 'misc')

    # Authors: "Family, Given" where possible, otherwise family or name only.
    authors = data.get('author')
    formatted_authors = []
    first_author_family = "Unknown"
    if authors and isinstance(authors, list):
        first = authors[0]
        if isinstance(first, dict):
            # ``or`` also covers a family key that is present but None/empty.
            first_author_family = str(first.get('family') or "Unknown")
        for auth in authors:
            if not isinstance(auth, dict):
                continue
            family = auth.get('family')
            given = auth.get('given')
            if family and given:
                formatted_authors.append(f"{family}, {given}")
            elif family:
                formatted_authors.append(str(family))
            elif auth.get('name'):
                formatted_authors.append(str(auth['name']))
    author_str = " and ".join(formatted_authors)

    # Year: first date source that has date-parts. A key may be present with
    # a None value, so each section is checked before it is read.
    date_parts = None
    for date_key in ('published-print', 'published-online', 'created'):
        section = data.get(date_key)
        if isinstance(section, dict) and section.get('date-parts'):
            date_parts = section['date-parts']
            break
    year_known = bool(
        date_parts and date_parts[0] and date_parts[0][0] is not None
    )
    year = str(date_parts[0][0]) if year_known else "nd"

    safe_family = "".join(filter(str.isalnum, first_author_family)) or "Unknown"
    cite_key = f"{safe_family}{year}"

    # ISBN is normally a list; take its first element. A plain string is
    # kept whole rather than being indexed down to its first character.
    isbn = data.get("ISBN")
    if isinstance(isbn, (list, tuple)):
        isbn = isbn[0] if isbn else None

    fields = {
        'author': author_str,
        'title': get_list_safe('title'),
        'journal': get_list_safe('container-title'),
        'year': year,
        'volume': data.get('volume'),
        'number': data.get('issue'),
        'pages': data.get('page'),
        'doi': data.get('DOI'),
        'publisher': data.get('publisher'),
        'url': data.get('URL'),
        'isbn': isbn,
    }
    active_fields = {k: v for k, v in fields.items() if v}

    # ``year`` is never empty (it falls back to "nd"), so the record counts
    # as empty when the placeholder year is the only thing left.
    if not year_known and set(active_fields) == {'year'}:
        return ""

    max_len = max(len(k) for k in active_fields)
    lines = [f"@{bib_type}{{{cite_key},"]
    for k, v in active_fields.items():
        # Escape &, % and _ unless a backslash already precedes them.
        clean_val = re.sub(r'(?<!\\)([&%_])', r'\\\1', str(v))
        padding = " " * (max_len - len(k))
        lines.append(f" {k}{padding} = {{{clean_val}}},")
    lines.append("}")
    return "\n".join(lines)