emacs/lisp/org-ref/org-ref-extract.el

;;; org-ref-extract.el --- Extract BibTeX from HTML  -*- lexical-binding: t; -*-

;; Copyright (C) 2023  Justus Piater

;; Author: Justus Piater <Justus-dev@Piater.name>
;; Keywords:

;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program.  If not, see <http://www.gnu.org/licenses/>.

;;; Commentary:

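;; Commands that create a BibTeX entry at point from the web page of
;; a paper, for OpenReview, PMLR, NeurIPS and CVF.  Each command
;; extracts the BibTeX source and additional fields from the HTML and
;; tries to download the PDF of the paper.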

;;; Code:

(declare-function org-ref-clean-bibtex-entry "org-ref-bibtex" ())
(declare-function xml-substitute-special "xml" ())


(defun org-ref--extract (html-buffer rx num)
  "Return the text of the first match of regexp RX in HTML-BUFFER.
The search starts at the beginning of HTML-BUFFER and moves point
in that buffer.  The returned string is the text matched by the
parenthesized sub-expression NUM of RX; return nil if RX does not
match."
  (with-current-buffer html-buffer
    (goto-char (point-min))
    (and (re-search-forward rx nil t)
         (match-string num))))


(defun org-ref--get-pdf (pdf-url)
  "Attach a PDF downloaded from PDF-URL to the BibTeX entry at point.
The PDF is stored as KEY.pdf, where KEY is the key of the entry,
in `bibtex-completion-library-path'.  If that variable is a list,
its first element is used.  Nothing is downloaded if the file
already exists or if PDF-URL is nil.  If the downloaded file is
not a PDF, it is deleted and PDF-URL is opened in a browser
instead.  If the PDF file exists afterwards, it is stored in the
FILE field of the entry and, if
`doi-utils-open-pdf-after-download' is non-nil, opened."
  (bibtex-beginning-of-entry)
  (let* ((key (cdr (assoc "=key=" (bibtex-parse-entry))))
         ;; The library path may be a single directory or a list of
         ;; directories; `car' on a string would signal an error.
         (library (if (listp bibtex-completion-library-path)
                      (car bibtex-completion-library-path)
                    bibtex-completion-library-path))
         ;; `file-name-as-directory' guards against a configured
         ;; directory that lacks its trailing slash.
         (pdf-file (concat (and library (file-name-as-directory library))
                           key ".pdf")))
    ;; Callers pass nil when no PDF link could be extracted.
    (when (and pdf-url (not (file-exists-p pdf-file)))
      (url-copy-file pdf-url pdf-file)
      (if (org-ref-pdf-p pdf-file)
          (message "%s saved" pdf-file)
        (delete-file pdf-file)
        (message "No pdf was downloaded.")
        (browse-url pdf-url)))
    (when (file-exists-p pdf-file)
      (bibtex-set-field "file" pdf-file)
      (when doi-utils-open-pdf-after-download
        (org-open-file pdf-file)))))


(defun org-ref--extract-entry-from-html
    (html-buffer bibtex pdf-url &rest more-fields)
  "Insert at point a BibTeX entry built from HTML-BUFFER.
HTML-BUFFER is killed afterwards, even if an error is signaled.

BIBTEX is either the BibTeX source as a string, or a cons cell
whose car is a regexp and whose cdr is the number of the
parenthesized sub-expression that matches the BibTeX source in
HTML-BUFFER.  PDF-URL is likewise either a string or such a cons
cell.  Each element of MORE-FIELDS is a cons cell whose car is a
field name and whose cdr is either the field value or such a
regexp cons cell; fields whose value is nil are not set.

Literal backslash-n sequences in the inserted source are turned
into newlines.  The entry is then cleaned with
`org-ref-clean-bibtex-entry', the additional fields are set, and
the PDF is fetched with `org-ref--get-pdf'."
  (unwind-protect
      (let ((bibtex (if (consp bibtex)
                        (org-ref--extract html-buffer (car bibtex) (cdr bibtex))
                      bibtex))
            (pdf-url (if (consp pdf-url)
                         (org-ref--extract html-buffer
                                           (car pdf-url) (cdr pdf-url))
                       pdf-url))
            (more-fields
             (mapcar
              (lambda (field)
                (cons (car field)
                      (if (consp (cdr field))
                          (org-ref--extract html-buffer
                                            (cadr field) (cddr field))
                        (cdr field))))
              more-fields))
            (start (point)))
        (unless (stringp bibtex)
          (user-error "No BibTeX entry found"))
        ;; Re-running the mode function would needlessly reset the
        ;; buffer-local state of a buffer already in BibTeX mode.
        (unless (derived-mode-p 'bibtex-mode)
          (bibtex-mode))
        (insert bibtex)
        ;; Confine the replacements to the inserted text, so that
        ;; other entries in the buffer (which may legitimately
        ;; contain a backslash followed by n, as in \nabla) are left
        ;; alone.
        (save-restriction
          (narrow-to-region start (point))
          (goto-char (point-min))
          (while (search-forward "{\\n" nil t)
            (replace-match "{" t t))
          (goto-char (point-min))
          (while (search-forward "\\n" nil t)
            (replace-match "\n" t t))
          ;; Leave point at the end of the new entry, so that the
          ;; following commands operate on it and not on some other
          ;; entry of the buffer.
          (goto-char (point-max))
          (skip-chars-backward " \t\n"))
        (org-ref-clean-bibtex-entry)
        (dolist (pair more-fields)
          (when (cdr pair)
            (bibtex-set-field (car pair) (cdr pair))))
        (org-ref--get-pdf pdf-url))
    (kill-buffer html-buffer)))


(defun org-ref--html-buffer (url)
  "Retrieve URL into a new buffer and return that buffer.
The resource is fetched synchronously and decoded, and XML
entity and character references in it are substituted.  Signal
an error if URL cannot be retrieved.  The caller is responsible
for killing the returned buffer."
  ;; The file only declares `xml-substitute-special', which does not
  ;; load its library; `url-handlers' (home of `url-insert') is
  ;; required as well in case it has not been loaded yet.
  (require 'url-handlers)
  (require 'xml)
  ;; Make sure the binding below is dynamic under lexical-binding.
  (defvar url-request-method)
  (let ((html-buffer (generate-new-buffer "org-ref--html"))
        (success nil))
    (unwind-protect
        (with-current-buffer html-buffer
          (let* ((url-request-method "GET")
                 (response (url-retrieve-synchronously url)))
            (unless response
              (error "Could not retrieve %s" url))
            ;; `url-insert' does not kill the response buffer.
            (unwind-protect
                (url-insert response)
              (kill-buffer response)))
          (let ((decoded (xml-substitute-special (buffer-string))))
            (erase-buffer)
            (insert decoded))
          (setq success t)
          html-buffer)
      ;; Do not leave a half-filled buffer behind on failure.
      (unless success
        (kill-buffer html-buffer)))))


(defun org-ref-extract-from-openreview (id)
  "Insert at point a BibTeX entry for the OpenReview submission ID.
The forum page of the submission is retrieved, and the BibTeX
source as well as the abstract, area, keywords, summary and
supplementary material link are extracted from it where present.
The PDF of the submission is fetched from the corresponding pdf
page."
  (interactive "MOpenReview ID: ")
  (let* ((url (concat "https://openreview.net/forum?id=" id))
         ;; Build the PDF URL from ID directly.  Replacing "forum"
         ;; by "pdf" in URL would also rewrite an ID that happens to
         ;; contain "forum".
         (pdf-url (concat "https://openreview.net/pdf?id=" id))
         (html-buffer (org-ref--html-buffer url)))
    ;; Collapse doubly escaped newlines to singly escaped ones.
    (with-current-buffer html-buffer
      (replace-string-in-region "\\\\n" "\\n" (point-min) (point-max)))
    (org-ref--extract-entry-from-html
     html-buffer
     '("\\\\\"_bibtex\\\\\":\\({\\\\\"value\\\\\":\\)?\\\\\"\\(@.+?}\\)\\\\\""
       . 2)
     pdf-url
     '("abstract" .
       ("<meta name=\"citation_abstract\" content=\"\\(.+?\\(\n.*?\\)*?\\)\"/>" . 1))
     '("area" .
       ("\"Please_choose_the_closest_area_that_your_submission_falls_into\":\"\\(.+?\\)\"" . 1))
     '("keywords" . ("Keywords.*?\"note-content-value\">\\(.+?\\)</span>" . 1))
     '("summary" .
       ("\\(Summary\\|TL;DR\\).*?\"note-content-value\">\\(.+?\\)</span>" . 2))
     ;; Should we proactively download supplementary materials too?
     (cons "supp"
           (if-let* ((supp (org-ref--extract
                            html-buffer
                            ">Supplementary Material<.*?href=\"\\([^\"]+\\)" 1)))
               (concat "https://openreview.net" supp))))))


(defun org-ref-extract-from-pmlr (url)
  "Insert at point a BibTeX entry for the PMLR paper page at URL.
The BibTeX source, the URL of the PDF and, where present, the
link to the supplementary PDF are extracted from the page."
  (interactive "MPMLR URL: ")
  (let ((html-buffer (org-ref--html-buffer url))
        (bibtex-spec '("id=\"bibtex\">\n\\(@.+\\(\n.*?\\)+?\\)\n</" . 1))
        (pdf-spec '("{\\(http.+\\.pdf\\)}" . 1))
        ;; Should we proactively download supplementary materials too?
        (supp-spec
         '("supp" . ("href=\"\\(https?://proceedings\\.mlr\\.press/[^\"]+?-supp[^\"]*?\\)\".*?>Supplementary PDF</" . 1))))
    (org-ref--extract-entry-from-html
     html-buffer bibtex-spec pdf-spec supp-spec)))


(defun org-ref-extract-from-neurips (url)
  "Insert at point a BibTeX entry for the NeurIPS abstract page at URL.
The page is searched for a link to the BibTeX source, which is
then retrieved from the proceedings site.  The abstract and the
links to the paper PDF and to the supplemental material are
extracted from the page, and URL itself is stored in the URL
field.  Signal an error if the page contains no BibTeX link or if
the BibTeX source cannot be retrieved."
  (interactive "MNeurIPS Abstract URL: ")
  ;; Make sure `url-insert' is available.
  (require 'url-handlers)
  (let* ((neurips-url "https://proceedings.neurips.cc")
         (html-buffer (org-ref--html-buffer url))
         (bibtex
          (let ((fetched nil))
            (unwind-protect
                (let ((bibtex-path
                       (org-ref--extract
                        html-buffer
                        "href=[\"']\\([^\"']+bibtex[^\"']*\\)[\"']" 1)))
                  (unless bibtex-path
                    (user-error "No BibTeX link found at %s" url))
                  (with-temp-buffer
                    (let ((response (url-retrieve-synchronously
                                     (concat neurips-url bibtex-path))))
                      (unless response
                        (error "Could not retrieve %s"
                               (concat neurips-url bibtex-path)))
                      ;; `url-insert' does not kill the response buffer.
                      (unwind-protect
                          (url-insert response)
                        (kill-buffer response)))
                    (setq fetched (buffer-string))))
              ;; On failure, nobody else would kill the HTML buffer.
              (unless fetched
                (kill-buffer html-buffer))))))
    (org-ref--extract-entry-from-html
     html-buffer
     bibtex
     (concat neurips-url
             (org-ref--extract html-buffer
                               "href=[\"']\\([^\"']+-Paper[^\"']*\\)[\"']" 1))
     (cons "url" url)
     '("abstract" . ("<h4>Abstract</h4>[ \n]*?\\(<p>\\)+\\(.+?\\)</p>" . 2))
     ;; Should we proactively download supplementary materials too?
     (cons "supp"
           (if-let*
               ((supp (org-ref--extract
                       html-buffer
                       "href=[\"']\\([^\"']+-Supplemental[^\"']*\\)[\"']" 1)))
               (concat neurips-url supp))))))


(defun org-ref-extract-from-cvf (url)
  "Insert at point a BibTeX entry for the CVF HTML page at URL.
The BibTeX source, the abstract and the links to the PDF and to
the supplementary material are extracted from the page, and URL
itself is stored in the URL field.  The SUPP field is set only
if the page links to supplementary material."
  (interactive "MCVF HTML URL: ")
  (let ((cvf-url "https://openaccess.thecvf.com")
        (html-buffer (org-ref--html-buffer url)))
    (org-ref--extract-entry-from-html
     html-buffer
     '("class=\"bibref[^\"]*\">[ \n]*\\(@.+?\\(\n.*?\\)+?\\)[ \n]*</" . 1)
     (concat cvf-url (org-ref--extract
                      html-buffer "<a href=[\"']\\([^\"']+\\)[\"']>pdf</a>" 1))
     (cons "url" url)
     '("abstract" . ("id=\"abstract\">[ \n]*\\([^<]+\\)[ \n]*</" . 1))
     ;; Should we proactively download supplementary materials too?
     ;; Without a link, the value must be nil so that the field is
     ;; skipped; concatenating nil would yield the bare site URL.
     (cons "supp"
           (if-let* ((supp (org-ref--extract
                            html-buffer
                            "href=[\"']\\([^\"']+\\)[\"']>supp</"
                            1)))
               (concat cvf-url supp))))))


(provide 'org-ref-extract)
;;; org-ref-extract.el ends here