1455 lines
66 KiB
EmacsLisp
1455 lines
66 KiB
EmacsLisp
;;; parsebib.el --- A library for parsing bib files -*- lexical-binding: t -*-
|
|
|
|
;; Copyright (c) 2014-2025 Joost Kremers
|
|
;; All rights reserved.
|
|
|
|
;; Author: Joost Kremers <joostkremers@fastmail.fm>
|
|
;; Maintainer: Joost Kremers <joostkremers@fastmail.fm>
|
|
;; Created: 2014
|
|
;; Package-Version: 20250922.1100
|
|
;; Package-Revision: 4a9df6f1b4f3
|
|
;; Keywords: text bibtex
|
|
;; URL: https://github.com/joostkremers/parsebib
|
|
;; Package-Requires: ((emacs "25.1"))
|
|
|
|
;; Redistribution and use in source and binary forms, with or without
|
|
;; modification, are permitted provided that the following conditions
|
|
;; are met:
|
|
;;
|
|
;; 1. Redistributions of source code must retain the above copyright
|
|
;; notice, this list of conditions and the following disclaimer.
|
|
;; 2. Redistributions in binary form must reproduce the above copyright
|
|
;; notice, this list of conditions and the following disclaimer in the
|
|
;; documentation and/or other materials provided with the distribution.
|
|
;; 3. The name of the author may not be used to endorse or promote products
|
|
;; derived from this software without specific prior written permission.
|
|
;;
|
|
;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
;; IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
;; OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
;; IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
;; INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
;; NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES ; LOSS OF USE,
|
|
;; DATA, OR PROFITS ; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
;; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
;; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
;; THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
;;; Commentary:
|
|
|
|
;; See the README for details.
|
|
;;
|
|
;; Acknowledgements:
|
|
;;
|
|
;; The code to clean up TeX markup in field values was contributed by Hugo
|
|
;; Heagren <hugo.heagren@kcl.ac.uk>; additional improvements were made by
|
|
;; <rahguzar@zohomail.eu>.
|
|
|
|
;;; Code:
|
|
|
|
(require 'cl-lib)
|
|
(eval-and-compile (unless (fboundp 'json-parse-buffer)
|
|
(require 'json)))
|
|
(defvar json-object-type)
|
|
|
|
(declare-function json-read "json.el")
|
|
|
|
(defvar bibtex-dialect)
|
|
(defvar bibtex-dialect-list)
|
|
|
|
(defvar parsebib-hashid-fields nil
|
|
"List of fields used to create a hash id for each entry.
|
|
Hash ids can only be created for BibTeX/biblatex files. The hash
|
|
id is stored in the entry in the special field `=hashid='.")
|
|
|
|
;; Regexes describing BibTeX identifiers and keys. Note that while $ ^ &
|
|
;; are valid in BibTeX keys, they may nonetheless be problematic, because
|
|
;; they are special for TeX. Which characters are allowed in keys and
|
|
;; identifiers differs depending on context. The StackExchange answer at
|
|
;; https://tex.stackexchange.com/questions/96454/using-bibtex-keys-containing-parentheses-with-biber
|
|
;; says that Biber uses a library for parsing .bib files (btparse) that
|
|
;; disallows the following characters in keys;
|
|
;;
|
|
;; " # % ' ( ) , = { }
|
|
;;
|
|
;; Note that parentheses are allowed by BibTeX, though, so I won't exclude
|
|
;; them here.
|
|
;;
|
|
;; Also, keys and identifiers (field and @String names) are distinguished,
|
|
;; though I'm not sure that is correct (or even desirable). I'll keep it
|
|
;; that way until someone complains, though.
|
|
|
|
(defconst parsebib--bibtex-identifier "[^\"@\\#%',={}() \t\n\f]+" "Regexp describing a licit BibTeX identifier.")
|
|
(defconst parsebib--bibtex-key-regexp "[^\"#%',={} \t\n\f]+" "Regexp describing a licit BibTeX key.")
|
|
(defconst parsebib--bibtex-entry-start "^[ \t]*@" "Regexp describing the start of an entry.")
|
|
|
|
(defvar parsebib-postprocessing-excluded-fields '("file" "url" "doi")
|
|
"List of fields that should not be post-processed.")
|
|
|
|
;; Cleaning up TeX code is very slow, so we restrict it to those fields for
|
|
;; which it makes sense.
|
|
(defvar parsebib-replace-TeX-fields '("author" "editor" "title")
|
|
"List of fields in which TeX code should be cleaned up.")
|
|
|
|
(defvar parsebib--biblatex-inheritances '(;; Source Target
|
|
("all" "all"
|
|
(("ids" . none)
|
|
("crossref" . none)
|
|
("xref" . none)
|
|
("entryset" . none)
|
|
("entrysubtype" . none)
|
|
("execute" . none)
|
|
("label" . none)
|
|
("options" . none)
|
|
("presort" . none)
|
|
("related" . none)
|
|
("relatedoptions" . none)
|
|
("relatedstring" . none)
|
|
("relatedtype" . none)
|
|
("shorthand" . none)
|
|
("shorthandintro" . none)
|
|
("sortkey" . none)))
|
|
|
|
;; Source Target
|
|
("mvbook, book" "inbook, bookinbook, suppbook"
|
|
(("author" . "author")
|
|
("author" . "bookauthor")))
|
|
|
|
;; Source Target
|
|
("mvbook" "book, inbook, bookinbook, suppbook"
|
|
(("title" . "maintitle")
|
|
("subtitle" . "mainsubtitle")
|
|
("titleaddon" . "maintitleaddon")
|
|
("shorttitle" . none)
|
|
("sorttitle" . none)
|
|
("indextitle" . none)
|
|
("indexsorttitle" . none)))
|
|
|
|
;; Source Target
|
|
("mvcollection, mvreference" "collection, reference, incollection, inreference, suppcollection"
|
|
(("title" . "maintitle")
|
|
("subtitle" . "mainsubtitle")
|
|
("titleaddon" . "maintitleaddon")
|
|
("shorttitle" . none)
|
|
("sorttitle" . none)
|
|
("indextitle" . none)
|
|
("indexsorttitle" . none)))
|
|
|
|
;; Source Target
|
|
("mvproceedings" "proceedings, inproceedings"
|
|
(("title" . "maintitle")
|
|
("subtitle" . "mainsubtitle")
|
|
("titleaddon" . "maintitleaddon")
|
|
("shorttitle" . none)
|
|
("sorttitle" . none)
|
|
("indextitle" . none)
|
|
("indexsorttitle" . none)))
|
|
|
|
;; Source Target
|
|
("book" "inbook, bookinbook, suppbook"
|
|
(("title" . "booktitle")
|
|
("subtitle" . "booksubtitle")
|
|
("titleaddon" . "booktitleaddon")
|
|
("shorttitle" . none)
|
|
("sorttitle" . none)
|
|
("indextitle" . none)
|
|
("indexsorttitle" . none)))
|
|
|
|
;; Source Target
|
|
("collection, reference" "incollection, inreference, suppcollection"
|
|
(("title" . "booktitle")
|
|
("subtitle" . "booksubtitle")
|
|
("titleaddon" . "booktitleaddon")
|
|
("shorttitle" . none)
|
|
("sorttitle" . none)
|
|
("indextitle" . none)
|
|
("indexsorttitle" . none)))
|
|
|
|
;; Source Target
|
|
("proceedings" "inproceedings"
|
|
(("title" . "booktitle")
|
|
("subtitle" . "booksubtitle")
|
|
("titleaddon" . "booktitleaddon")
|
|
("shorttitle" . none)
|
|
("sorttitle" . none)
|
|
("indextitle" . none)
|
|
("indexsorttitle" . none)))
|
|
|
|
;; Source Target
|
|
("periodical" "article, suppperiodical"
|
|
(("title" . "journaltitle")
|
|
("subtitle" . "journalsubtitle")
|
|
("shorttitle" . none)
|
|
("sorttitle" . none)
|
|
("indextitle" . none)
|
|
("indexsorttitle" . none))))
|
|
|
|
"Inheritance scheme for BibLaTeX cross-referencing.
|
|
Inheritances are specified for pairs of source and target entry
|
|
type, where the target is the cross-referencing entry and the
|
|
source the cross-referenced entry. Each pair specifies the
|
|
fields in the source and the fields in the target that they
|
|
correspond with.
|
|
|
|
Inheritances valid for all entry types are defined by specifying
|
|
the entry type as \"all\". The entry type may also be a
|
|
comma-separated list of entry types.
|
|
|
|
If no inheritance rule is set up for a given entry type+field
|
|
combination, the field inherits from the same-name field in the
|
|
cross-referenced entry. If no inheritance should take place, the
|
|
target field is set to the symbol `none'.")
|
|
|
|
;;;; BibTeX / biblatex parser
|
|
|
|
;;; Parser primitives
|
|
;;
|
|
;; The parser is divided into a set of primitives, which do the actual
|
|
;; reading, and a set of grammar rules, which describe the syntax of a
|
|
;; BibTeX file.
|
|
;;
|
|
;; A few things to keep in mind:
|
|
;;
|
|
;; - The primitives are BibTeX-agnostic. They read specific chunks of the
|
|
;; source and return them.
|
|
;;
|
|
;; - The primitives can be parameterised; that is, the exact text that they
|
|
;; read may depend on arguments passed to them. The grammar rules do not
|
|
;; have any parameters.
|
|
;;
|
|
;; - The primitives are responsible for skipping whitespace.
|
|
|
|
(define-error 'parsebib-error "[Parsebib error]" 'error)
|
|
|
|
(defun parsebib--skip-whitespace ()
|
|
"Skip whitespace."
|
|
(skip-chars-forward " \n\r\t\f\v"))
|
|
|
|
(defun parsebib--char (chars &optional noerror)
|
|
"Read the character at point.
|
|
CHARS is a list of characters. If the character at point matches
|
|
a character in CHARS, return it and move point, otherwise signal
|
|
an error, unless NOERROR is non-nil, in which case return nil."
|
|
(parsebib--skip-whitespace)
|
|
(if (memq (char-after) chars)
|
|
(prog1
|
|
(char-after)
|
|
(forward-char 1))
|
|
(unless noerror
|
|
(signal 'parsebib-error (list (point)
|
|
"Invalid character `%c'"
|
|
(following-char))))))
|
|
|
|
(defun parsebib--keyword (keywords &optional noerror)
|
|
"Read the keyword following point.
|
|
KEYWORDS is a list of allowed keywords. If the text following
|
|
point matches one of KEYWORDS (case-insensitively), return it and
|
|
move point. Otherwise signal an error, unless NOERROR is
|
|
non-nil, in which case return nil."
|
|
(parsebib--skip-whitespace)
|
|
(let ((case-fold-search t))
|
|
(if (looking-at (regexp-opt keywords))
|
|
(let ((keyword (match-string-no-properties 0)))
|
|
(progn
|
|
(goto-char (match-end 0))
|
|
keyword))
|
|
(unless noerror
|
|
(signal 'parsebib-error (list (point)
|
|
"Invalid keyword %s"
|
|
keywords
|
|
(char-after)))))))
|
|
|
|
(defun parsebib--symbol (regexp &optional noerror)
|
|
"Read a symbol and return it.
|
|
REGEXP is a regular expression describing a licit symbol. If a
|
|
symbol is found, return it. Otherwise signal an error, unless
|
|
NOERROR is non-nil, in which case return nil."
|
|
(parsebib--skip-whitespace)
|
|
(if (looking-at regexp)
|
|
(progn
|
|
(goto-char (match-end 0))
|
|
(match-string-no-properties 0))
|
|
(unless noerror
|
|
(signal 'parsebib-error (list (point) "Illegal identifier")))))
|
|
|
|
(defun parsebib--seq-delim (open close esc)
|
|
"Read a delimited sequence.
|
|
A delimited sequence is a sequence delimited by OPEN and CLOSE
|
|
characters, which must be different (e.g., any kind of
|
|
parentheses). ESC is an escape character that can be used to
|
|
escape OPEN and CLOSE inside the sequence. OPEN and CLOSE can
|
|
appear in the sequence unescaped as long as they are
|
|
balanced. (In other words, the sequence can contain nested
|
|
sequences)."
|
|
(parsebib--skip-whitespace)
|
|
(let ((beg (point))
|
|
(n-braces 1)
|
|
(skip-chars (format "^%c%c" open close)))
|
|
(parsebib--char (list open))
|
|
(while (and (> n-braces 0)
|
|
(not (eobp)))
|
|
(skip-chars-forward skip-chars)
|
|
(cond
|
|
((eq (char-after) open)
|
|
(unless (eq (char-before) esc)
|
|
(setq n-braces (1+ n-braces))))
|
|
((eq (char-after) close)
|
|
(unless (eq (char-before) esc)
|
|
(setq n-braces (1- n-braces)))))
|
|
(ignore-error end-of-buffer (forward-char 1)))
|
|
(if (= n-braces 0)
|
|
(buffer-substring-no-properties beg (point))
|
|
(signal 'parsebib-error (list beg
|
|
"Opening %c has no closing %c"
|
|
open
|
|
close)))))
|
|
|
|
(defun parsebib--string (delim esc)
|
|
"Read a string delimited by DELIM.
|
|
A string is a delimited sequence where the opening and closing
|
|
delimiters are identical, e.g., \"...\". ESC is the escape
|
|
character."
|
|
(parsebib--skip-whitespace)
|
|
(let ((beg (point))
|
|
(continue t)
|
|
(skip-chars (format "^%c" delim)))
|
|
(parsebib--char (list delim))
|
|
(while (and continue
|
|
(not (eobp)))
|
|
(skip-chars-forward skip-chars)
|
|
(unless (eq (char-before) esc)
|
|
(setq continue nil))
|
|
(forward-char 1))
|
|
(if (not continue)
|
|
(buffer-substring-no-properties beg (point))
|
|
(signal 'parsebib-error (list beg
|
|
"Opening %c has no closing %c"
|
|
delim
|
|
delim)))))
|
|
|
|
(defun parsebib--comment-line ()
|
|
"Read a single-line comment and return it."
|
|
(prog1 (buffer-substring-no-properties (point) (pos-eol))
|
|
(forward-line 1)))
|
|
|
|
(defun parsebib--match (rules &optional noerror)
|
|
"Check if a rule in RULES matches at point.
|
|
Apply the first rule that matches and return the result. If no
|
|
rule matches, signal an error, unless NOERROR is non-nil, in
|
|
which case return nil.
|
|
|
|
RULES is a list of symbols, each naming a parsing rule."
|
|
(parsebib--skip-whitespace)
|
|
(let ((start-pos (point))
|
|
last-error)
|
|
(catch 'success
|
|
(dolist (rule rules)
|
|
(condition-case err
|
|
(let ((res (funcall rule)))
|
|
(throw 'success res))
|
|
(parsebib-error
|
|
(goto-char start-pos)
|
|
(setq last-error err))))
|
|
(unless noerror
|
|
(signal (car last-error) (cdr last-error))))))
|
|
|
|
;;; Parser rules
|
|
|
|
;; Basic building blocks
|
|
|
|
(defun parsebib--text ()
|
|
"Parse text.
|
|
Text is anything that is between braces or double quotes that
|
|
should be read literally."
|
|
(parsebib--match '(parsebib--braced-text
|
|
parsebib--quoted-text)))
|
|
|
|
(defun parsebib--braced-text ()
|
|
"Parse text in curly braces."
|
|
(parsebib--seq-delim ?\{ ?\} ?\\))
|
|
|
|
(defun parsebib--quoted-text ()
|
|
"Parse text in double quotes."
|
|
(parsebib--string ?\" ?\\))
|
|
|
|
(defun parsebib--key ()
|
|
"Parse a BibTeX key."
|
|
(parsebib--symbol parsebib--bibtex-key-regexp))
|
|
|
|
(defun parsebib--identifier ()
|
|
"Parse a BibTeX identifier."
|
|
(parsebib--symbol parsebib--bibtex-identifier))
|
|
|
|
(defun parsebib--value ()
|
|
"Parse a BibTeX value.
|
|
A value is one component of a composed value (see
|
|
`parsebib--composed-value') and can either be a piece of quoted
|
|
text (i.e., text in double quotes or braces) or a @String
|
|
abbreviation."
|
|
(or (parsebib--match '(parsebib--text
|
|
parsebib--identifier)
|
|
:noerror)
|
|
(signal 'parsebib-error (list (point) "Expected {, \" or identifier"))))
|
|
|
|
(defun parsebib--composed-value ()
|
|
"Parse a BibTeX composed field value.
|
|
A composed value consists of one or more values concatenated
|
|
using the character `#'. They typically appear after an equal
|
|
sign as field values and in @String definitions as expansions."
|
|
(let ((val (list (parsebib--value))))
|
|
(while (and (parsebib--char '(?#) :noerror)
|
|
(not (eobp)))
|
|
(push (parsebib--value) val))
|
|
(nreverse val)))
|
|
|
|
(defun parsebib--assignment ()
|
|
"Parse a BibTeX assignment.
|
|
An assignment is the combination of an identifier, an equal sign
|
|
and a composed value. A @String definition has exactly one
|
|
assignment, an entry has a potentially unlimited number."
|
|
(if-let* ((id (parsebib--identifier))
|
|
(_ (parsebib--char '(?=)))
|
|
(val (parsebib--composed-value)))
|
|
(cons id val)
|
|
(signal 'parsebib-error (list (point) "Malformed key=value assignment"))))
|
|
|
|
(defun parsebib--fields ()
|
|
"Parse a set of BibTeX assignments.
|
|
A set of assignments makes up the body of an entry."
|
|
(let ((fields (list (parsebib--assignment))))
|
|
(while (and (parsebib--char '(?,) :noerror)
|
|
(not (eobp)))
|
|
;; There may be a comma after the final field of an entry. If that
|
|
;; happens, reading another assignment will fail, so we capture the
|
|
;; error here.
|
|
(ignore-error parsebib-error
|
|
(push (parsebib--assignment) fields)))
|
|
fields))
|
|
|
|
;; BibTeX items
|
|
|
|
(defun parsebib--@comment ()
|
|
"Parse a @Comment.
|
|
Return the contents of the @Comment as a string."
|
|
(parsebib--char '(?@))
|
|
(parsebib--keyword '("comment"))
|
|
(or (parsebib--match '(parsebib--text
|
|
parsebib--comment-line)
|
|
:noerror)
|
|
(signal 'parsebib-error (list (point) "Malformed @Comment"))))
|
|
|
|
(defun parsebib--@preamble ()
|
|
"Parse a @Preamble.
|
|
Return the contents of the @Preamble as a string."
|
|
(parsebib--char '(?@))
|
|
(parsebib--keyword '("preamble"))
|
|
(or (parsebib--text)
|
|
(signal 'parsebib-error (list (point) "Malformed @Preamble"))))
|
|
|
|
(defun parsebib--@string ()
|
|
"Parse an @String definition.
|
|
Return the definition as a cons cell of the abbreviation and a
|
|
composed value as a list."
|
|
(if-let* ((_ (parsebib--char '(?@)))
|
|
(_ (parsebib--keyword '("string")))
|
|
(open (parsebib--char '(?\{ ?\( )))
|
|
(definition (parsebib--assignment))
|
|
(_ (parsebib--char (alist-get open '((?\{ ?\}) (?\( ?\)))))))
|
|
definition
|
|
(signal 'parsebib-error (list (point) "Malformed @String definition"))))
|
|
|
|
(defun parsebib--@entry ()
|
|
"Parse a BibTeX database entry.
|
|
Return the entry as an alist of <field . value> pairs, where
|
|
<field> is a string and <value> is a list of strings."
|
|
(if-let* ((_ (parsebib--char '(?@)))
|
|
(type (parsebib--identifier))
|
|
(open (parsebib--char '(?\{ ?\( )))
|
|
(key (parsebib--key))
|
|
(_ (parsebib--char '(?,)))
|
|
(fields (parsebib--fields))
|
|
(_ (parsebib--char (alist-get open '((?\{ ?\}) (?\( ?\)))))))
|
|
(progn (push (cons "=type=" (list type)) fields)
|
|
(push (cons "=key=" (list key)) fields)
|
|
fields)
|
|
(signal 'parsebib-error (list (point) "Malformed entry definition"))))
|
|
|
|
;;;; Low-level BibTeX/biblatex API
|
|
|
|
(defun parsebib-find-next-item ()
|
|
"Find the first (potential) BibTeX item following point.
|
|
This function simply searches for an @ at the start of a line,
|
|
possibly preceded by spaces or tabs, followed by a string of
|
|
characters as defined by `parsebib--bibtex-identifier'.
|
|
|
|
If an item is found, position point at the start of the line and
|
|
return the name of the item as a string, either \"Comment\",
|
|
\"Preamble\" or \"String\", or the entry type (without the @).
|
|
If no item is found, move point to the end of the buffer."
|
|
(when (re-search-forward parsebib--bibtex-entry-start nil 0)
|
|
(if (looking-at (concat "\\(" parsebib--bibtex-identifier "\\)" "[[:space:]]*[\(\{]?"))
|
|
(prog1
|
|
(match-string-no-properties 1)
|
|
(goto-char (pos-bol)))
|
|
(signal 'parsebib-error (list (point) "Search for BibTeX entry failed")))))
|
|
|
|
(defun parsebib--get-hashid-string (fields)
|
|
"Create a string from the contents of FIELDS to compute a hash id."
|
|
(cl-loop for field in parsebib-hashid-fields
|
|
collect (or
|
|
;; Remove braces {}.
|
|
(replace-regexp-in-string "^{\\|}$" "" (cdr (assoc-string field fields 'case-fold)))
|
|
"")
|
|
into hashid-fields
|
|
finally return (mapconcat #'identity hashid-fields "")))
|
|
|
|
(defun parsebib-read-entry (&optional fields strings replace-TeX)
|
|
"Read a BibTeX entry starting at point.
|
|
Point should be positioned before the `@'-character that starts
|
|
the entry, with possibly whitespace intervening. Return an alist
|
|
of (<field> . <contents>) conses, or nil if no entry was found.
|
|
The returned alist provides the entry key in the field \"=key=\"
|
|
and the entry type in the field \"=type=\".
|
|
|
|
If `parsebib-hashid-fields' is non-nil, a hash ID is added in the
|
|
field \"=hashid=\". The hash is computed on the basis of the
|
|
contents of the fields listed in `parsebib-hashid-fields' using
|
|
the function `secure-hash' and the `sha256' algorithm.
|
|
|
|
FIELDS is a list of the field names (as strings) to be read and
|
|
included in the result. Fields not in the list are ignored.
|
|
Case is ignored when comparing fields to the list in FIELDS. If
|
|
FIELDS is nil, all fields are returned. Note that if FIELDS is
|
|
non-nil, it should contain the field names \"=key=\" and
|
|
\"=type=\".
|
|
|
|
STRINGS and REPLACE-TEX are used to post-process field values.
|
|
See the function `parsebib--post-process' for details."
|
|
(let ((entry (parsebib--@entry)))
|
|
(when fields
|
|
(setq entry (seq-filter (lambda (field)
|
|
(member-ignore-case (car field) fields))
|
|
entry)))
|
|
(setq entry (mapcar (lambda (field)
|
|
(parsebib--post-process field strings replace-TeX))
|
|
entry))
|
|
(when parsebib-hashid-fields
|
|
(push (cons "=hashid=" (secure-hash 'sha256 (parsebib--get-hashid-string fields))) entry))
|
|
entry))
|
|
|
|
(defun parsebib-read-string (&optional strings)
|
|
"Read the @String definition beginning at point.
|
|
Return the definition as a cons cell (<abbrev> . <expansion>).
|
|
|
|
If STRINGS is provided, it should be a hash table with @String
|
|
abbreviations, which are used to expand abbreviations in the
|
|
string's expansion."
|
|
(let* ((definition (parsebib--@string))
|
|
(abbrev (car definition))
|
|
(expansion (cdr definition)))
|
|
(setq expansion (if strings
|
|
(string-join (parsebib--post-process-strings expansion strings t))
|
|
(string-join expansion " # ")))
|
|
(cons abbrev expansion)))
|
|
|
|
(defalias 'parsebib-read-preamble 'parsebib--@preamble)
|
|
(defalias 'parsebib-read-comment 'parsebib--@comment)
|
|
|
|
;;;;; Post-processing stuff
|
|
|
|
(defun parsebib--post-process (field strings replace-TeX)
|
|
"Post-process FIELD.
|
|
FIELD is a cons cell consisting of the field name and the field
|
|
value. The field value is a list of strings.
|
|
|
|
If STRINGS is provided, it should be a hash table with string
|
|
definitions. @String abbreviations in field values are then
|
|
expanded using these definitions. In addition, field values are
|
|
unquoted, newlines are removed and sequences of whitespace are
|
|
collapsed into a single space.
|
|
|
|
If REPLACE-TEX is non-nil, TeX markup is cleaned up. See the
|
|
variable `parsebib-TeX-markup-replace-alist' for details.
|
|
|
|
No post-processing is applied to fields listed in
|
|
`parsebib-postprocessing-excluded-fields', with the exception of
|
|
unquoting, which is always applied if STRINGS is non-nil.
|
|
|
|
Finally, the strings in the field value are concatenated. Return
|
|
value is a cons cell of field name and field value, the value now
|
|
being a single string."
|
|
(let* ((name (car field))
|
|
(value (cdr field))
|
|
(post-process (not (member-ignore-case name parsebib-postprocessing-excluded-fields)))
|
|
(replace-TeX (and replace-TeX (member-ignore-case name parsebib-replace-TeX-fields))))
|
|
(setq value (if strings
|
|
(string-join (parsebib--post-process-strings value strings post-process))
|
|
(string-join value " # ")))
|
|
(when (and replace-TeX post-process)
|
|
(setq value (parsebib-clean-TeX-markup value)))
|
|
(cons name value)))
|
|
|
|
(defun parsebib--post-process-strings (strings abbrevs post-process)
|
|
"Post-process the strings in STRINGS.
|
|
STRINGS is a list of strings, ABBREVS a hash table with @String
|
|
definitions. Post-processing involves three changes: First,
|
|
sequences of whitespace are collapsed into a single space.
|
|
Second, if a string has an expansion in ABBREVS, it is replaced
|
|
with the expansion. Both these changes are only applied if
|
|
POST-PROCESS is non-nil. Lastly, if the string is enclosed in
|
|
braces {} or double -quotes \"\", these are removed."
|
|
(mapcar (lambda (str)
|
|
(when post-process
|
|
(setq str (replace-regexp-in-string "[[:space:]\t\n\f]+" " " str)))
|
|
(cond
|
|
((and post-process
|
|
(gethash str abbrevs)))
|
|
((string-match "\\`[\"{]\\(.*?\\)[\"}]\\'" str)
|
|
(match-string 1 str))
|
|
(t str)))
|
|
strings))
|
|
|
|
(defun parsebib-expand-xrefs (entries inheritance)
|
|
"Expand cross-referencing items in ENTRIES.
|
|
BibTeX entries in ENTRIES that have a `crossref' field are
|
|
expanded with the fields in the cross-referenced entry. ENTRIES
|
|
is a hash table with entries. This hash table is updated with
|
|
the new fields. The return value of this function is always nil.
|
|
|
|
INHERITANCE indicates the inheritance schema. It can be a symbol
|
|
`BibTeX' or `biblatex', or it can be an explicit inheritance
|
|
schema. See the variable `parsebib--biblatex-inheritances' for
|
|
details on the structure of such an inheritance schema."
|
|
(maphash (lambda (key fields)
|
|
(let ((xref (cdr (assoc-string "crossref" fields))))
|
|
(when xref
|
|
(if (string-match-p (concat "\\b[\"{]" parsebib--bibtex-key-regexp "[\"}]\\b") xref)
|
|
(setq xref (substring xref 1 -1)))
|
|
(let* ((source (gethash xref entries))
|
|
(updated-entry (parsebib--get-xref-fields fields source inheritance)))
|
|
(when updated-entry
|
|
(puthash key updated-entry entries))))))
|
|
entries))
|
|
|
|
(defun parsebib--get-xref-fields (target-entry source-entry inheritance)
|
|
"Return TARGET-ENTRY supplemented with fields inherited from SOURCE-ENTRY.
|
|
TARGET-ENTRY and SOURCE-ENTRY are entry alists. Fields in
|
|
SOURCE-ENTRY for which TARGET-ENTRY has no value are added to
|
|
TARGET-ENTRY. Return value is the modified TARGET-ENTRY.
|
|
|
|
INHERITANCE is an inheritance schema. It can either be one of
|
|
the symbols `BibTeX' or `biblatex', or it can be an explicit
|
|
inheritance schema. See the variable
|
|
`parsebib--biblatex-inheritances' for details on the structure of
|
|
such an inheritance schema."
|
|
(when (and target-entry source-entry)
|
|
(when (eq inheritance 'biblatex)
|
|
(setq inheritance parsebib--biblatex-inheritances))
|
|
(let* ((source-type (concat "\\b" (cdr (assoc-string "=type=" source-entry)) "\\b"))
|
|
(target-type (concat "\\b" (cdr (assoc-string "=type=" target-entry)) "\\b"))
|
|
(for-all-types (nth 2 (assoc-string "all" inheritance)))
|
|
(inheritable-fields
|
|
(unless (eq inheritance 'BibTeX)
|
|
(append
|
|
(apply #'append (mapcar #'cl-third
|
|
(cl-remove-if-not
|
|
(lambda (elem)
|
|
(and (string-match-p source-type (nth 0 elem))
|
|
(string-match-p target-type (nth 1 elem))))
|
|
inheritance)))
|
|
for-all-types)))
|
|
(new-fields (mapcan (lambda (field)
|
|
(let ((target-field (parsebib--get-target-field (car field) inheritable-fields)))
|
|
(if (and target-field
|
|
(not (assoc-string target-field target-entry 'case-fold)))
|
|
(list (cons target-field (cdr field))))))
|
|
source-entry)))
|
|
(append target-entry new-fields))))
|
|
|
|
(defun parsebib--get-target-field (source-field inheritances)
|
|
"Return the target field for inheritance from SOURCE-FIELD.
|
|
Inheritance is determined by INHERITANCES, which is an alist of
|
|
source/target pairs. If no inheritance should take place for
|
|
SOURCE-FIELD, the target in the relevant item in INHERITANCES is
|
|
the symbol `none'. If there is no item for SOURCE-FIELD in
|
|
INHERITANCES, SOURCE-FIELD is returned. Note that it is valid
|
|
for INHERITANCES to be nil."
|
|
;; Note: the argument INHERITANCES differs from the INHERITANCE argument in
|
|
;; the previous two functions. It is a simple alist of (source-field
|
|
;; . target-field) pairs.
|
|
(let ((target-field (cdr (assoc-string source-field inheritances 'case-fold))))
|
|
(cond
|
|
((null target-field)
|
|
source-field)
|
|
((eq target-field 'none)
|
|
nil)
|
|
(t target-field))))
|
|
|
|
;;;; Clean up TeX markup
|
|
|
|
(defvar parsebib-TeX-cleanup-target 'display
|
|
"Target for `parsebib-clean-TeX-markup'.
|
|
This variable affects the output of the functions that convert
|
|
LaTeX font commands \\textbf, \\textit, and \\emph. Its value
|
|
should be one of the symbols `display', `markdown' `org' or
|
|
`plain'. Any other value is treated as a synonym for `plain'.
|
|
See `parsebib--convert-tex-italics' and
|
|
`parsebib--convert-tex-bold' for details.")
|
|
|
|
(defun parsebib--convert-tex-italics (str)
|
|
"Return STR converted to italic face.
|
|
Depending on the value of `parsebib-TeX-cleanup-target', add a
|
|
face property `italic' to STR, or return it with Markdown or Org
|
|
markup for italic text."
|
|
(pcase parsebib-TeX-cleanup-target
|
|
('display (propertize str 'face 'italic))
|
|
('markdown (concat "*" str "*"))
|
|
('org (concat "/" str "/"))
|
|
(_ str)))
|
|
|
|
(defun parsebib--convert-tex-bold (str)
|
|
"Return STR converted to bold face.
|
|
Depending on the value of `parsebib-TeX-cleanup-target', add a
|
|
face property `bold' to STR, or return it with Markdown or Org
|
|
markup for bold text."
|
|
(pcase parsebib-TeX-cleanup-target
|
|
('display (propertize str 'face 'bold))
|
|
('markdown (concat "**" str "**"))
|
|
('org (concat "*" str "*"))
|
|
(_ str)))
|
|
|
|
(defvar parsebib-TeX-command-replacement-alist
|
|
'(("ddag" . "\N{DOUBLE DAGGER}")
|
|
("textdaggerdbl" . "\N{DOUBLE DAGGER}")
|
|
("dag" . "\N{DAGGER}")
|
|
("textdagger" . "\N{DAGGER}")
|
|
("textpertenthousand" . "\N{PER TEN THOUSAND SIGN}")
|
|
("textperthousand" . "\N{PER MILLE SIGN}")
|
|
("textquestiondown" . "\N{INVERTED QUESTION MARK}")
|
|
("P" . "\N{PILCROW SIGN}")
|
|
("textdollar" . "$")
|
|
("S" . "\N{SECTION SIGN}")
|
|
("ldots" . "\N{HORIZONTAL ELLIPSIS}")
|
|
("dots" . "\N{HORIZONTAL ELLIPSIS}")
|
|
("textellipsis" . "\N{HORIZONTAL ELLIPSIS}")
|
|
("textemdash" . "\N{EM DASH}")
|
|
("textendash" . "\N{EN DASH}")
|
|
("textbar" . "|")
|
|
|
|
;; Non-ASCII Letters (Excluding Accented Letters)
|
|
("AA" . "\N{LATIN CAPITAL LETTER A WITH RING ABOVE}")
|
|
("AE" . "\N{LATIN CAPITAL LETTER AE}")
|
|
("DH" . "\N{LATIN CAPITAL LETTER ETH}")
|
|
("DJ" . "\N{LATIN CAPITAL LETTER ETH}")
|
|
("L" . "\N{LATIN CAPITAL LETTER L WITH STROKE}")
|
|
("SS" . "\N{LATIN CAPITAL LETTER SHARP S}")
|
|
("NG" . "\N{LATIN CAPITAL LETTER ENG}")
|
|
("OE" . "\N{LATIN CAPITAL LIGATURE OE}")
|
|
("O" . "\N{LATIN CAPITAL LETTER O WITH STROKE}")
|
|
("TH" . "\N{LATIN CAPITAL LETTER THORN}")
|
|
|
|
("aa" . "\N{LATIN SMALL LETTER A WITH RING ABOVE}")
|
|
("ae" . "\N{LATIN SMALL LETTER AE}")
|
|
("dh" . "\N{LATIN SMALL LETTER ETH}")
|
|
("dj" . "\N{LATIN SMALL LETTER ETH}")
|
|
("l" . "\N{LATIN SMALL LETTER L WITH STROKE}")
|
|
("ss" . "\N{LATIN SMALL LETTER SHARP S}")
|
|
("ng" . "\N{LATIN SMALL LETTER ENG}")
|
|
("oe" . "\N{LATIN SMALL LIGATURE OE}")
|
|
("o" . "\N{LATIN SMALL LETTER O WITH STROKE}")
|
|
("th" . "\N{LATIN SMALL LETTER THORN}")
|
|
|
|
("ij" . "ij")
|
|
("i" . "\N{LATIN SMALL LETTER DOTLESS I}")
|
|
("j" . "\N{LATIN SMALL LETTER DOTLESS J}")
|
|
|
|
;; Formatting Commands
|
|
("textit" . parsebib--convert-tex-italics)
|
|
("emph" . parsebib--convert-tex-italics)
|
|
("textbf" . parsebib--convert-tex-bold)
|
|
("textsc" . upcase))
|
|
"An alist of <command>-<replacement> pairs for LaTeX commands.
|
|
<command> is the name of a TeX or LaTeX command (without
|
|
backslash), <replacement> is the string with which it is
|
|
replaced.
|
|
|
|
<replacement> can also be a function of one argument. In this
|
|
case, <command> must take at least one obligatory argument, which
|
|
is passed as the first argument of the replacement function. The
|
|
return value of this function is used as the replacement string
|
|
for <command>.
|
|
|
|
See `parsebib-TeX-markup-replacement-alist' and the function
|
|
`parsebib-clean-TeX-markup' to see how this variable is used.")
|
|
|
|
(defvar parsebib-TeX-accent-replacement-alist
|
|
'(("\"" . "\N{COMBINING DIAERESIS}")
|
|
("'" . "\N{COMBINING ACUTE ACCENT}")
|
|
("." . "\N{COMBINING DOT ABOVE}")
|
|
("=" . "\N{COMBINING MACRON}")
|
|
("^" . "\N{COMBINING CIRCUMFLEX ACCENT}")
|
|
("`" . "\N{COMBINING GRAVE ACCENT}")
|
|
("b" . "\N{COMBINING MACRON BELOW}")
|
|
("c" . "\N{COMBINING CEDILLA}")
|
|
("d" . "\N{COMBINING DOT BELOW}")
|
|
("H" . "\N{COMBINING DOUBLE ACUTE ACCENT}")
|
|
("k" . "\N{COMBINING OGONEK}")
|
|
("U" . "\N{COMBINING DOUBLE VERTICAL LINE ABOVE}")
|
|
("u" . "\N{COMBINING BREVE}")
|
|
("v" . "\N{COMBINING CARON}")
|
|
("~" . "\N{COMBINING TILDE}")
|
|
("|" . "\N{COMBINING COMMA ABOVE}")
|
|
("f" . "\N{COMBINING INVERTED BREVE}")
|
|
("G" . "\N{COMBINING DOUBLE GRAVE ACCENT}")
|
|
("h" . "\N{COMBINING HOOK ABOVE}")
|
|
("C" . "\N{COMBINING DOUBLE GRAVE ACCENT}")
|
|
("r" . "\N{COMBINING RING ABOVE}") )
|
|
"Alist of <command>-<accent> pairs for LaTeX diacritics.
|
|
<command> is the name of a TeX or LaTeX command (without
|
|
backslash), <accent> is the Unicode combining character for the
|
|
diacritic that <command> generates. Both <command> and <accent>
|
|
must be strings.
|
|
|
|
The replacement string for <command> is composed of its
|
|
obligatory argument (usually a single character) and the
|
|
combining diacritic.
|
|
|
|
See `parsebib-TeX-markup-replacement-alist' and the function
|
|
`parsebib-clean-TeX-markup' to see how this variable is used.")
|
|
|
|
(defvar parsebib-TeX-literal-replacement-alist
|
|
;; LaTeX2 Escapable "Special" Characters
|
|
`(("\\%" . "%") ("\\&" . "&") ("\\#" . "#") ("\\$" . "$")
|
|
;; Quotes
|
|
("``" . "\N{LEFT DOUBLE QUOTATION MARK}")
|
|
("`" . "\N{LEFT SINGLE QUOTATION MARK}")
|
|
("''" . "\N{RIGHT DOUBLE QUOTATION MARK}")
|
|
("'" . "\N{RIGHT SINGLE QUOTATION MARK}")
|
|
;; Dashes
|
|
("---" . "\N{EM DASH}")
|
|
("--" . "\N{EN DASH}")
|
|
;; Remove all remaining {braces}
|
|
("{" . "") ("}" . ""))
|
|
"Alist of <literal>-<replacement> pairs. Both are strings.
|
|
This variable contains characters that are special in LaTeX and
|
|
single-character, non-ASCII LaTeX commands.
|
|
|
|
Note that adding pairs to this variable has no effect unless
|
|
`parsebib-TeX-markup-replacement-alist' is adjusted accordingly.
|
|
For example, after adding a <literal>-<replacement> pair, the
|
|
following code will ensure that <literal> gets replaced with
|
|
<replacement>.
|
|
|
|
(cl-callf (lambda (regex) (rx (or <literal> (regexp regex))))
|
|
(alist-get (quote parsebib--TeX-replace-literal)
|
|
parsebib-TeX-markup-replacement-alist))
|
|
|
|
See `parsebib-TeX-markup-replacement-alist' and the function
|
|
`parsebib-clean-TeX-markup' to see how this variable is used.")
|
|
|
|
(defvar parsebib-TeX-markup-replacement-alist
|
|
`((parsebib--TeX-replace-command-or-accent
|
|
;; This regexp matches any latex command, i.e., anything that starts
|
|
;; with a backslash. The name of the command, which is either a string
|
|
;; of alphabetic characters or a single non-alphabetic character, is
|
|
;; captured by group 1. The command can have a mandatory argument
|
|
;; enclosed by braces which is captured by group 2. If the command has
|
|
;; no arguments in brackets or braces, the first non-white space
|
|
;; letter after the command is captured in group 3. This is to be able
|
|
;; to deal with accents. Note that the capturing of arguments in
|
|
;; braces is imperfect, because doing it properly requires sexp
|
|
;; parsing. It will fail for cases like \command{\anothercommand{an
|
|
;; arg}some text}.
|
|
. ,(rx "\\" (group-n 1 (or (1+ letter) nonl))
|
|
(: (* blank) (opt (or (: (* (: "[" (* (not (any "]"))) "]"))
|
|
"{" (group-n 2 (0+ (not (any "}")))) (opt "}"))
|
|
(group-n 3 letter))))))
|
|
(parsebib--TeX-replace-literal
|
|
. ,(rx (or (regexp (regexp-opt (mapcar #'car parsebib-TeX-literal-replacement-alist)))
|
|
(1+ blank)))))
|
|
"Alist of replacements and strings for TeX markup.
|
|
This is used in `parsebib-clean-TeX-markup' to make TeX markup more
|
|
suitable for display. Each item in the list consists of a replacement
|
|
and a regexp. The replacement can be a string (which will
|
|
simply replace the match) or a function (the match will be
|
|
replaced by the result of calling the function on the match
|
|
string). Earlier elements are evaluated before later ones, so if
|
|
one string is a subpattern of another, the second must appear
|
|
later (e.g. \"''\" is before \"'\").
|
|
|
|
For the common cases of replacing a LaTeX command or a literal
|
|
it is faster to use `parsebib-TeX-command-replacement-alist'
|
|
and `parsebib-TeX-literal-replacement-alist' respectively.")
|
|
|
|
(defun parsebib--TeX-replace-command-or-accent (string)
|
|
"Return the replacement text for the command or accent matched by STRING."
|
|
(let* ((cmd (match-string 1 string))
|
|
;; bar is the argument in braces.
|
|
(bar (match-string 2 string))
|
|
;; If there is no argument in braces, consider the letter after
|
|
;; the command as the argument. Clean this argument.
|
|
(arg (parsebib-clean-TeX-markup (or (if bar bar (match-string 3 string)) "")))
|
|
;; Check if the cmd is an accent that needs to be replaced
|
|
;; and get its replacement.
|
|
(acc (alist-get cmd parsebib-TeX-accent-replacement-alist nil nil #'equal))
|
|
;; If it is not an accent, check if it is a command that needs to be replaced
|
|
;; and get the replacement.
|
|
(rep (or acc (alist-get cmd parsebib-TeX-command-replacement-alist nil nil #'equal))))
|
|
(cond
|
|
;; If replacement is a function, call it with the argument.
|
|
((functionp rep) (funcall rep arg))
|
|
;; Otherwise combine the replacement with the argument. The order of combination
|
|
;; depends on whether the command is an accent or not.
|
|
(rep (if acc (concat arg rep) (concat rep arg)))
|
|
;; Now we handle the fallback cases. If there is a braced argument but no
|
|
;; replacement for the command was found, consider the replacement to be
|
|
;; empty.
|
|
((and bar (not (equal "" bar))) bar)
|
|
;; Otherwise clean any optional arguments by discarding them.
|
|
(t (replace-regexp-in-string (rx "[" (* (not (any "]"))) "]") "" string t t)))))
|
|
|
|
(defun parsebib--TeX-replace-literal (string)
|
|
"Look up the replacement text for literal STRING."
|
|
(or (alist-get string parsebib-TeX-literal-replacement-alist nil nil #'equal)
|
|
" "))
|
|
|
|
(defun parsebib-clean-TeX-markup (string)
|
|
"Return STRING without TeX markup.
|
|
Any substring matching the car of a cell in
|
|
`parsebib-TeX-markup-replace-alist' is replaced with the
|
|
corresponding cdr (if the cdr is a string), or with the result of
|
|
calling the cdr on the match (if it is a function)."
|
|
(let ((case-fold-search nil))
|
|
(cl-loop for (replacement . pattern) in parsebib-TeX-markup-replacement-alist
|
|
do (setq string (replace-regexp-in-string
|
|
pattern replacement string
|
|
t t))
|
|
finally return string)))
|
|
|
|
;;;; High-level BibTeX/biblatex API
|
|
|
|
(defun parsebib-collect-preambles ()
|
|
"Collect all @Preamble definitions in the current buffer.
|
|
Return a list of strings, each string a separate @Preamble."
|
|
(save-excursion
|
|
(goto-char (point-min))
|
|
(let (res)
|
|
(cl-loop for item = (parsebib-find-next-item)
|
|
while (and item
|
|
(cl-equalp item "preamble"))
|
|
do (push (parsebib--@preamble) res))
|
|
(nreverse res))))
|
|
|
|
(defun parsebib-collect-comments ()
|
|
"Collect all @Comment definitions in the current buffer.
|
|
Return a list of strings, each string a separate @Comment."
|
|
(save-excursion
|
|
(goto-char (point-min))
|
|
(let (res)
|
|
(cl-loop for item = (parsebib-find-next-item)
|
|
while (and item
|
|
(cl-equalp item "comment"))
|
|
do (push (parsebib--@comment) res))
|
|
(nreverse (delq nil res)))))
|
|
|
|
(cl-defun parsebib-collect-strings (&key strings expand-strings)
|
|
"Collect all @String definitions in the current buffer.
|
|
Return value is a hash with the abbreviations as keys and the
|
|
expansions as values. If STRINGS is a hash table with test
|
|
function `equal', it is used to store the @String definitions.
|
|
If EXPAND-STRINGS is non-nil, @String expansions are expanded
|
|
themselves using the @String definitions already stored in
|
|
STRINGS."
|
|
(or (and (hash-table-p strings)
|
|
(eq 'equal (hash-table-test strings)))
|
|
(setq strings (make-hash-table :test #'equal)))
|
|
(save-excursion
|
|
(goto-char (point-min))
|
|
(cl-loop with string = nil
|
|
for item = (parsebib-find-next-item)
|
|
while (and item
|
|
(cl-equalp item "string"))
|
|
do
|
|
(setq string (parsebib-read-string (if expand-strings strings)))
|
|
(puthash (car string) (cdr string) strings))
|
|
strings))
|
|
|
|
(cl-defun parsebib-collect-bib-entries (&key entries strings inheritance fields)
|
|
"Collect all BibTeX / biblatex entries in the current buffer.
|
|
Return value is a hash table containing the entries. If ENTRIES
|
|
is a hash table with test function `equal', it is used to store
|
|
the entries collected in the buffer. Note that ENTRIES does not
|
|
have to be empty. It may contain entries from a previous parse.
|
|
|
|
If STRINGS is non-nil, it should be a hash table of string
|
|
definitions, which are used to expand abbreviations used in the
|
|
entries. In addition, if STRINGS is set, sequences of whitespace
|
|
in field values are collapsed into a single space, field values
|
|
are unquoted (i.e., the double quotes or braces around them are
|
|
removed), and TeX markup is prettified (see
|
|
`parsebib-clean-TeX-markup' for details). Note that @String
|
|
expansion, collapsing of whitespace and prettifying TeX markup
|
|
are not applied to fields listed in
|
|
`parsebib-postprocessing-excluded-fields', but unquoting is.
|
|
|
|
If INHERITANCE is non-nil, cross-references in the entries are
|
|
resolved: if the crossref field of an entry points to an entry
|
|
already in ENTRIES (which includes the entries that appear
|
|
earlier in the buffer), the fields of the latter that do not occur
|
|
in the entry are added to it. INHERITANCE indicates the
|
|
inheritance schema used for determining which fields inherit from
|
|
which fields. It can be a symbol `BibTeX' or `biblatex', or it
|
|
can be an explicit inheritance schema. (See the variable
|
|
`parsebib--biblatex-inheritances' for details on the structure of
|
|
such an inheritance schema.) It can also be the symbol t, in
|
|
which case the local variable block is checked for a
|
|
dialect (using the variable `bibtex-dialect'), or, if no such
|
|
local variable is found, the value of the variable
|
|
`bibtex-dialect'.
|
|
|
|
FIELDS is a list of the field names (as strings) to be read and
|
|
included in the result. Fields not in the list are ignored,
|
|
except \"=key=\" and \"=type=\", which are always included. Case
|
|
is ignored when comparing fields to the list in FIELDS. If
|
|
FIELDS is nil, all fields are returned."
|
|
(or (and (hash-table-p entries)
|
|
(eq 'equal (hash-table-test entries)))
|
|
(setq entries (make-hash-table :test #'equal)))
|
|
(if (eq inheritance t)
|
|
(setq inheritance (or (parsebib-find-bibtex-dialect)
|
|
(and (boundp 'bibtex-dialect) bibtex-dialect)
|
|
'BibTeX)))
|
|
;; Ensure =key= and =type= are in `fields'.
|
|
(if fields
|
|
(setq fields (append (list "=key=" "=type=" fields))))
|
|
(condition-case err
|
|
(save-excursion
|
|
(goto-char (point-min))
|
|
(cl-loop with entry = nil
|
|
for entry-type = (parsebib-find-next-item)
|
|
while entry-type do
|
|
(unless (member-ignore-case entry-type '("preamble" "string" "comment"))
|
|
(setq entry (parsebib-read-entry fields strings (not (null strings))))
|
|
(if entry
|
|
(puthash (cdr (assoc-string "=key=" entry)) entry entries))))
|
|
(when inheritance
|
|
(parsebib-expand-xrefs entries inheritance))
|
|
entries)
|
|
(parsebib-error
|
|
(save-excursion
|
|
(goto-char (cadr err))
|
|
(signal (car err) (list (concat (apply #'format (cddr err))
|
|
(format " at position (%d,%d)" (line-number-at-pos) (current-column)))))))))
|
|
|
|
(defun parsebib-find-bibtex-dialect ()
|
|
"Find the BibTeX dialect of a file if one is set.
|
|
This function looks for a local value of the variable
|
|
`bibtex-dialect' in the local variable block at the end of the
|
|
file. Return nil if no dialect is found."
|
|
(save-excursion
|
|
(goto-char (point-max))
|
|
(let ((case-fold-search t)
|
|
(bibtex-dialect-list (or (and (boundp 'bibtex-dialect-list)
|
|
bibtex-dialect-list)
|
|
'(BibTeX biblatex))))
|
|
(when (re-search-backward (concat parsebib--bibtex-entry-start "comment") (- (point-max) 3000) t)
|
|
(let ((comment (parsebib--@comment)))
|
|
(when (and comment
|
|
(string-match-p "\\`{[ \n\t\r]*Local Variables:" comment)
|
|
(string-match-p "End:[ \n\t\r]*}\\'" comment)
|
|
(string-match (concat "bibtex-dialect: " (regexp-opt (mapcar #'symbol-name bibtex-dialect-list) t)) comment))
|
|
(intern (match-string 1 comment))))))))
|
|
|
|
(cl-defun parsebib-parse-bib-buffer (&key entries strings expand-strings inheritance fields replace-TeX)
|
|
"Parse the current buffer and return all BibTeX data.
|
|
Return a list of five elements: a hash table with the entries, a
|
|
hash table with the @String definitions, a list of @Preamble
|
|
definitions, a list of @Comments and the BibTeX dialect, if
|
|
present in the file.
|
|
|
|
If ENTRIES is a hash table with test function `equal', it is used
|
|
to store the entries. Any existing entries with identical keys
|
|
are overwritten. Similarly, if STRINGS is a hash table with test
|
|
function `equal', the @String definitions are stored in it.
|
|
|
|
If EXPAND-STRINGS is non-nil, abbreviations in the entries and
|
|
@String definitions are expanded using the @String definitions
|
|
already in STRINGS. In addition, sequences of whitespace in
|
|
field values are collapsed into a single space and field values
|
|
are unquoted, i.e., the double quotes or braces around them are
|
|
removed. Note that @String expansion, collapsing of whitespace
|
|
and prettifying TeX markup are not applied to fields listed in
|
|
`parsebib-postprocessing-excluded-fields', but unquoting is.
|
|
|
|
If INHERITANCE is non-nil, cross-references in the entries are
|
|
resolved: if the crossref field of an entry points to an entry
|
|
already in ENTRIES, the fields of the latter that do not occur in
|
|
the entry are added to it. INHERITANCE indicates the inheritance
|
|
schema used for determining which fields inherit from which
|
|
fields. It can be a symbol `BibTeX' or `biblatex', which means
|
|
to use the default inheritance schema for either dialect, or it
|
|
can be an explicit inheritance schema. (See the variable
|
|
`parsebib--biblatex-inheritances' for details on the structure of
|
|
such an inheritance schema.) It can also be the symbol t, in
|
|
which case the local variable block is checked for a
|
|
dialect (using the variable `bibtex-dialect'), or, if no such
|
|
local variable is found, the value of the variable
|
|
`bibtex-dialect'.
|
|
|
|
FIELDS is a list of the field names (as strings) to be read and
|
|
included in the result. Fields not in the list are ignored,
|
|
except \"=key=\" and \"=type=\", which are always included. Case
|
|
is ignored when comparing fields to the list in FIELDS. If
|
|
FIELDS is nil, all fields are returned.
|
|
|
|
REPLACE-TEX indicates whether TeX markup should be replaced with
|
|
ASCII/Unicode characters. See the variable
|
|
`parsebib-TeX-markup-replace-alist' for details."
|
|
(or (and (hash-table-p entries)
|
|
(eq (hash-table-test entries) 'equal))
|
|
(setq entries (make-hash-table :test #'equal)))
|
|
(or (and (hash-table-p strings)
|
|
(eq (hash-table-test strings) 'equal))
|
|
(setq strings (make-hash-table :test #'equal)))
|
|
;; Ensure =key= and =type= are in `fields'.
|
|
(if fields
|
|
(setq fields (append (list "=key=" "=type=") fields)))
|
|
(condition-case err
|
|
(let ((dialect (or (parsebib-find-bibtex-dialect)
|
|
(and (boundp 'bibtex-dialect) bibtex-dialect)
|
|
'BibTeX))
|
|
preambles comments)
|
|
(save-excursion
|
|
(goto-char (point-min))
|
|
(cl-loop for item = (parsebib-find-next-item)
|
|
while item do
|
|
(cond
|
|
((cl-equalp item "string") ; `cl-equalp' compares strings case-insensitively.
|
|
(let ((string (parsebib-read-string (if expand-strings strings))))
|
|
(if string
|
|
(puthash (car string) (cdr string) strings))))
|
|
((cl-equalp item "preamble")
|
|
(push (parsebib--@preamble) preambles))
|
|
((cl-equalp item "comment")
|
|
(push (parsebib--@comment) comments))
|
|
((stringp item)
|
|
(let ((entry (parsebib-read-entry fields (if expand-strings strings) replace-TeX)))
|
|
(when entry
|
|
(puthash (cdr (assoc-string "=key=" entry)) entry entries))))))
|
|
(when inheritance (parsebib-expand-xrefs entries (if (eq inheritance t) dialect inheritance)))
|
|
(list entries strings (nreverse preambles) (nreverse comments) dialect)))
|
|
(parsebib-error
|
|
(save-excursion
|
|
(goto-char (cadr err))
|
|
(signal (car err) (list (concat (apply #'format (cddr err))
|
|
(format " at position (%d,%d)" (line-number-at-pos) (current-column)))))))))
|
|
|
|
;;;; CSL-JSON API
|
|
|
|
(cl-defun parsebib-parse-json-buffer (&key entries stringify year-only fields)
|
|
"Parse the current buffer and return all CSL-JSON data.
|
|
The return value is a hash table containing all the elements.
|
|
The hash table's keys are the \"id\" values of the entries, the
|
|
hash table's values are alists as returned by `json-parse-buffer'
|
|
or `json-read'
|
|
|
|
If ENTRIES is a hash table with test function `equal', it is used
|
|
to store the entries. Any existing entries with identical keys
|
|
are overwritten.
|
|
|
|
If STRINGIFY is non-nil, JSON values that are not
|
|
strings (notably name and date fields) are converted to strings.
|
|
If additionally YEAR-ONLY is non-nil, dates are shortened to just
|
|
the year part.
|
|
|
|
FIELDS is a list of field names (as symbols) to be read and
|
|
included in the result. Fields not in the list are ignored,
|
|
except `id' and `type', which are always included. If FIELDS is
|
|
nil, all fields are returned.
|
|
|
|
If a JSON object is encountered that does not have an \"id\"
|
|
field, a `parsebib-error' is raised."
|
|
(or (and (hash-table-p entries)
|
|
(eq (hash-table-test entries) 'equal))
|
|
(setq entries (make-hash-table :test #'equal)))
|
|
(when fields
|
|
(setq fields (append '(id type) fields)))
|
|
(let ((parse (if (and (fboundp 'json-serialize)
|
|
(json-serialize '((test . 1)))) ; Returns nil if native json support isn't working for some reason.
|
|
(lambda ()
|
|
(json-parse-buffer :object-type 'alist))
|
|
(lambda ()
|
|
(let ((json-object-type 'alist))
|
|
(json-read))))))
|
|
;; We do not read the entire file in one go, but instead parse each entry
|
|
;; separately. Large bibliographies would otherwise be returned as one
|
|
;; gigantic vector, which then needs to be converted to a hash table. If we
|
|
;; need to convert some of the data because `stringify' is t, the data is
|
|
;; held in memory twice.
|
|
(save-excursion
|
|
(goto-char (point-min))
|
|
;; JSON is pretty strict, not even comments are allowed. CSL-JSON
|
|
;; requires that the file is essentially one big array, so we know that
|
|
;; the first non-whitespace character in the file must be an opening
|
|
;; bracket;
|
|
(if (not (looking-at-p "[\n\t ]*\\["))
|
|
(error "[Parsebib Error] Not a valid CSL-JSON file"))
|
|
(let ((continue t))
|
|
(while continue
|
|
;; We also know that the first non-whitespace character after that
|
|
;; must be an opening brace:
|
|
(skip-chars-forward "^{")
|
|
(if-let* ((entry (funcall parse))
|
|
(id (alist-get 'id entry)))
|
|
(progn
|
|
(when fields
|
|
(setq entry (seq-filter (lambda (elt)
|
|
(memq (car elt) fields))
|
|
entry)))
|
|
(puthash id (if stringify
|
|
(parsebib-stringify-json entry year-only)
|
|
entry)
|
|
entries))
|
|
(signal 'parsebib-error (list (format "Malformed JSON entry at position (%d,%d)"
|
|
(line-number-at-pos) (current-column)))))
|
|
;; Parsing an entry moves point to the end of the entry. The next
|
|
;; character must be a comma if there is another entry. If we're not
|
|
;; seeing a comma, we've reached the end of the file:
|
|
(if (not (looking-at-p "[\n\t ]*,"))
|
|
(setq continue nil))))))
|
|
entries)
|
|
|
|
(defun parsebib-stringify-json (entry &optional year-only)
|
|
"Return ENTRY with all non-string values converted to strings.
|
|
ENTRY is a CSL-JSON entry in the form of an alist. ENTRY is
|
|
modified in place. Return value is ENTRY. If YEAR-ONLY is
|
|
non-nil, date fields are shortened to just the year."
|
|
(dolist (field entry)
|
|
(unless (stringp (alist-get (car field) entry))
|
|
(setf (alist-get (car field) entry)
|
|
(parsebib-stringify-json-field (assq (car field) entry) year-only))))
|
|
entry)
|
|
|
|
(defvar parsebib--json-name-fields '(author
|
|
collection-editor
|
|
composer
|
|
container-author
|
|
director
|
|
editor
|
|
editorial-director
|
|
illustrator
|
|
interviewer
|
|
original-author
|
|
recipient
|
|
reviewed-author
|
|
translator))
|
|
|
|
(defvar parsebib--json-date-fields '(accessed
|
|
container
|
|
event-date
|
|
issued
|
|
original-date
|
|
submitted))
|
|
|
|
(defvar parsebib--json-number-fields '(chapter-number
|
|
collection-number
|
|
edition
|
|
issue
|
|
number
|
|
number-of-pages
|
|
number-of-volumes
|
|
volume))
|
|
|
|
(defvar parsebib-json-name-field-template "{non-dropping-particle }{family, }{given}{ dropping-particle}{, suffix}{literal}"
|
|
"Template used to display name fields.")
|
|
|
|
(defvar parsebib-json-name-field-separator " and "
|
|
"Separator used to concatenate names in a name field.")
|
|
|
|
(defvar parsebib-json-field-separator ", "
|
|
"Separator used to concatenate items of array fields.")
|
|
|
|
(defun parsebib--process-template (template items)
|
|
"Process TEMPLATE and return a formatted string.
|
|
ITEMS is an alist, the keys of which may occur in TEMPLATE.
|
|
Braced occurrences of the keys in ITEMS are replaced with the
|
|
corresponding values. Note that the keys in ITEMS should be
|
|
symbols."
|
|
(cl-flet ((create-replacements (match)
|
|
(save-match-data
|
|
(string-match "{\\([^A-Za-z]*\\)\\([A-Za-z][A-za-z-]+\\)\\([^A-Za-z]*\\)}" match)
|
|
(let* ((pre (match-string 1 match))
|
|
(key (match-string 2 match))
|
|
(post (match-string 3 match))
|
|
(value (alist-get (intern key) items)))
|
|
(if value
|
|
(format "%s%s%s" pre value post)
|
|
"")))))
|
|
(replace-regexp-in-string "{.*?}" #'create-replacements template nil t)))
|
|
|
|
(defun parsebib-stringify-json-field (field &optional short)
|
|
"Return the value of FIELD as a string.
|
|
FIELD is a cons cell that constitutes a CSL-JSON field-value
|
|
pair. The car is the key, the cdr the value. If the value is a
|
|
string, return it with sequences of white space reduced to a
|
|
single space. Otherwise, convert it into a string. SHORT is
|
|
only relevant for date fields: if it is non-nil, return just a
|
|
year, or the string \"XXXX\" if no year part is present."
|
|
(let ((key (car field))
|
|
(value (cdr field)))
|
|
(cond
|
|
((stringp value)
|
|
(replace-regexp-in-string "[ \t\n\f[:space:]]+" " " value))
|
|
|
|
((numberp value)
|
|
(format "%s" value))
|
|
|
|
((memq key parsebib--json-name-fields)
|
|
(parsebib--json-stringify-name-field value))
|
|
|
|
((memq key parsebib--json-date-fields)
|
|
(parsebib--json-stringify-date-field value short))
|
|
|
|
;; In CSL-JSON v1.0, the only array field besides name and date fields
|
|
;; is "categories". It has an array of strings as value, so the `format'
|
|
;; isn't strictly necessary. We do it this way just to be on the safe
|
|
;; side.
|
|
((arrayp value)
|
|
(mapconcat (lambda (e) (format "%s" e)) value parsebib-json-field-separator))
|
|
|
|
;; This clause should never be reached.
|
|
(t (replace-regexp-in-string "\n" " " (format "%s" value))))))
|
|
|
|
(defun parsebib--json-stringify-name-field (names)
|
|
"Convert NAMES to a string.
|
|
NAMES is the value of a CSL-JSON name field, a vector of alists.
|
|
Conversion is done on the basis of
|
|
`parsebib-json-name-field-template': each field in this template
|
|
is replaced with the value of the field in NAME. Fields that
|
|
have no value in NAME are ignored."
|
|
(mapconcat (lambda (name)
|
|
(parsebib--process-template parsebib-json-name-field-template name))
|
|
names
|
|
parsebib-json-name-field-separator))
|
|
|
|
(defun parsebib--json-stringify-date-field (date &optional short)
|
|
"Convert DATE to a string.
|
|
DATE is the value of a CSL-JSON date field. If SHORT is non-nil,
|
|
try to return only a year (in a date range, just the year of the
|
|
first date). If no year part is present, SHORT returns
|
|
\"XXXX\"."
|
|
(if short
|
|
(if-let* ((date-parts (alist-get 'date-parts date))
|
|
(first-date (aref date-parts 0))
|
|
(year (aref first-date 0)))
|
|
(format "%s" year)
|
|
"XXXX")
|
|
|
|
;; Work with a copy of the original alist.
|
|
(setq date (copy-sequence date))
|
|
|
|
;; Set start-date and end-date.
|
|
(when-let* ((date-parts (alist-get 'date-parts date)))
|
|
(let* ((start-date (aref date-parts 0))
|
|
(end-date (if (= (length date-parts) 2)
|
|
(aref date-parts 1))))
|
|
(setf (alist-get 'date-parts date nil :remove) nil)
|
|
(setf (alist-get 'start-date date)
|
|
(parsebib--json-stringify-date-part start-date))
|
|
(if end-date (setf (alist-get 'end-date date)
|
|
(parsebib--json-stringify-date-part end-date)))))
|
|
|
|
;; Set season.
|
|
(when-let* ((season (alist-get 'season date)))
|
|
(if (numberp season)
|
|
(setf (alist-get 'season date)
|
|
(aref ["Spring" "Summer" "Autumn" "Winter"] (1- season)))))
|
|
|
|
;; Set circa.
|
|
(when-let* ((circa (alist-get 'circa date)))
|
|
(setf (alist-get 'circa date) "ca."))
|
|
|
|
;; Now convert the date.
|
|
(parsebib--process-template "{circa }{season }{start-date}{/end-date}{literal}{raw}"
|
|
date)))
|
|
|
|
(defun parsebib--json-stringify-date-part (date-parts)
|
|
"Convert DATE-PARTS into a string.
|
|
DATE-PARTS is a sequence with up to three numeric elements: a
|
|
year, a month and a day."
|
|
(parsebib--process-template "{year}{-month}{-day}"
|
|
(seq-mapn #'cons '(year month day) date-parts)))
|
|
|
|
;;;; Format-independent API
|
|
|
|
(cl-defun parsebib-parse (files &key entries strings (display t) fields)
|
|
"Parse one or more bibliography files.
|
|
FILES is the list of files to parse. All bibliographic entries
|
|
in FILES are collected and returned in a single hash table.
|
|
FILES can be a list of `.bib' or `.json' files, or a combination
|
|
of these. FILES can also be a string, which should be the path
|
|
to a single bibliography file.
|
|
|
|
ENTRIES, if provided, should be a hash table with test function
|
|
`equal', it is used to store the entries. Any existing entries
|
|
with identical keys are overwritten. If provided, ENTRIES is
|
|
also the return value. If ENTRIES is nil, a new hash table is
|
|
created and returned.
|
|
|
|
STRINGS, similarly a hash table with test function `equal', is
|
|
used to store the @String definitions. Although STRINGS is not
|
|
returned, it is modified in place and can therefore be used to
|
|
collect the @String definitions in the files being parsed.
|
|
|
|
If DISPLAY is non-nil, field values are returned in a way that is
|
|
suitable for display: in `.bib' files, @String abbreviations are
|
|
expanded, in `.json' files, values that are not strings are
|
|
converted to strings. Furthermore, sequences of white space
|
|
characters (including newlines) are reduced to a single space.
|
|
|
|
Specifically, setting DISPLAY means setting the arguments
|
|
EXPAND-STRINGS and INHERITANCES in the function
|
|
`parsebib-parse-bib-buffer' and setting STRINGIFY and YEAR-ONLY
|
|
in the function `parsebib-parse-json-buffer'. DISPLAY is simply
|
|
passed on to these arguments, which means that it can be set to
|
|
anything that INHERITANCES in `parsebib-parse-bib-buffer'
|
|
accepts. (The other arguments only distinguish between nil and
|
|
non-nil.) Note that DISPLAY defaults to t.
|
|
|
|
FIELDS is a list of the field names to be read and included in
|
|
the result. Fields not in the list are ignored. Note that field
|
|
names should be strings; when parsing a `.json' file, they are
|
|
converted to symbols. See the doc strings of
|
|
`parsebib-parse-bib-buffer' and `parsebib-parse-json-buffer' for
|
|
details. If FIELDS is nil, all fields are returned."
|
|
(or (and (hash-table-p entries)
|
|
(eq (hash-table-test entries) 'equal))
|
|
(setq entries (make-hash-table :test #'equal)))
|
|
(or (and (hash-table-p strings)
|
|
(eq (hash-table-test strings) 'equal))
|
|
(setq strings (make-hash-table :test #'equal)))
|
|
(when (stringp files)
|
|
(setq files (list files)))
|
|
(dolist (file files)
|
|
(with-temp-buffer
|
|
(insert-file-contents file)
|
|
(cond
|
|
((string= (file-name-extension file t) ".bib")
|
|
(parsebib-parse-bib-buffer :entries entries
|
|
:strings strings
|
|
:expand-strings display
|
|
:inheritance display
|
|
:fields fields
|
|
:replace-TeX display))
|
|
((string= (file-name-extension file t) ".json")
|
|
(parsebib-parse-json-buffer :entries entries
|
|
:stringify display
|
|
:year-only display
|
|
:fields (mapcar #'intern fields)))
|
|
(t (error "[Parsebib] Not a bibliography file: %s" file)))))
|
|
entries)
|
|
|
|
(provide 'parsebib)
|
|
|
|
;;; parsebib.el ends here
|