Allow Pandoc to parse metadata rather than doing it in the plugin.

zackw · zackw · commit ce9dc36efc57 · 2015-03-24T19:29:32.000-04:00
This involves a fairly complicated dance with a Pandoc "filter"
module in order to get all of the metadata to be visible in the
output, but means that all metadata formats supported by Pandoc
are available without the need for any additional Python modules.
It also means strings in metadata will be processed as Markdown.
diff --git a/README.md b/README.md
@@ -1,25 +1,23 @@
 pandoc_reader
 =============
 
-A pandoc [markdown] reader plugin for [pelican]
+A pandoc [markdown][] reader plugin for [pelican][]
 
 
 Requirements
 ------------
 
-  - [pandoc] in $PATH
-
+  - [pandoc][] in `$PATH`
 
 Installation
 ------------
 
 Instructions for installation of pelican plugins can be obtained from the [pelican plugin manual](https://github.com/getpelican/pelican-plugins/blob/master/Readme.rst).
 
-
 Configuration
 -------------
 
-Additional command line parameters can be passed to pandoc via the PANDOC_ARGS parameter.
+Additional command line parameters can be passed to pandoc via the `PANDOC_ARGS` parameter.
 
     PANDOC_ARGS = [
       '--mathjax',
@@ -30,13 +28,15 @@ Additional command line parameters can be passed to pandoc via the PANDOC_ARGS p
     ]
 
 Pandoc's markdown extensions can be enabled or disabled via the
-PANDOC_EXTENSIONS parameter.
+`PANDOC_EXTENSIONS` parameter.
 
     PANDOC_EXTENSIONS = [
       '+hard_line_breaks',
       '-citations'
     ]
 
+In addition to Pandoc's own default set of extensions, this plugin enables the `mmd_title_block` extension by default.  It can be disabled with `'-mmd_title_block'` as usual.
+
 Contributing
 ------------
 
diff --git a/embed_metadata_filter.py b/embed_metadata_filter.py
@@ -0,0 +1,47 @@
+# This is a filter script which embeds all of the metadata parsed by
+# Pandoc into the HTML output, where the main body of the reader can
+# pick it up.  In order to preserve Pandoc's translation of Markdown
+# in metadata values, we convert the metadata structure into an HTML
+# tree structure.  A <hr> separates the translated metadata from the
+# document itself.
+#
+# See http://johnmacfarlane.net/pandoc/scripting.html for documentation
+# of the JSON-serialized AST that we are manipulating.
+
+import json
+import sys
+
+def N(t, c, cls=None):  # build one AST node: {"t": type tag, "c": contents}
+    if cls is not None: c = [ ["", [cls], []], c ]  # prepend attributes whose only class is cls (presumably Pandoc's [id, classes, key-values] triple)
+    return { "t": t, "c": c }
+
+def cvt_metainlines(c):  # inline metadata -> Plain block holding one Span of class "metavalue"
+    return N("Plain", [N("Span", c, "metavalue")])
+
+def cvt_metamap(c):  # mapping -> DefinitionList with one (term, [[converted value]]) pair per key, keys in sorted order
+    return N("DefinitionList", [ ( [N("Str", key)], [[ convert(val) ]] )
+                                 for key, val in sorted(c.items()) ])
+
+CONVERTERS = {  # dispatch table: metadata node type tag -> function applied to that node's "c" contents
+    "MetaMap":               cvt_metamap,
+    "MetaInlines":           cvt_metainlines,
+    "MetaBool":    lambda c: cvt_metainlines([N("Str", str(c).lower())]),  # str(c).lower() gives "true"/"false" for a Python bool
+    "MetaString":  lambda c: cvt_metainlines([N("Str", c)]),
+    "MetaBlocks":  lambda c: N("Div", c, "metavalue"),
+    "MetaList":    lambda c: N("BulletList", [ [convert(item)] for item in c ])  # one single-block list item per element
+}
+
+def convert(item):  # dispatch on item["t"]; a tag missing from CONVERTERS raises KeyError
+    return CONVERTERS[item["t"]](item["c"])
+
+def main():  # read the JSON AST on stdin, prepend the rendered metadata and a rule, write JSON to stdout
+    blob = json.load(sys.stdin)
+    metadata = blob[0]['unMeta']  # assumes blob is [{"unMeta": ...}, [blocks]] -- TODO confirm against the Pandoc version in use
+    rendered = [cvt_metamap(metadata), N("HorizontalRule", [])]  # the rule separates metadata from the document body
+    rendered.extend(blob[1])
+    blob = [blob[0], rendered]  # the first element is passed through unchanged
+    json.dump(blob, sys.stdout, separators=(',',':'))  # compact separators, no extra whitespace
+
+# This filter script is imported by pandoc_reader in order to learn its
+# actual filename, so don't do anything unless invoked as __main__.
+if __name__ == '__main__': main()
diff --git a/pandoc_reader.py b/pandoc_reader.py
@@ -1,44 +1,167 @@
 import subprocess
+import sys
+
+try:                import xml.etree.cElementTree as ET
+except ImportError: import xml.etree.ElementTree  as ET
+
+try:                from io import StringIO
+except ImportError: from cStringIO import StringIO
+
 from pelican import signals
 from pelican.readers import BaseReader
 from pelican.utils import pelican_open
 
+from . import embed_metadata_filter
+
+def check_command(proc, cmd):
+    """Wait for PROC to exit (roughly as subprocess.check_call does)
+       and raise subprocess.CalledProcessError if its exit status is
+       nonzero.  CMD should be the command passed to subprocess.Popen."""
+    status = proc.wait()
+    if status:
+        raise subprocess.CalledProcessError(status, cmd)
+
+def extract_metadata(text):
+    """Split TEXT at the first "<hr />" into the metadata tree emitted
+       by the filter script and the HTML of the document itself.
+       Returns (document, metadata): the stripped document HTML and
+       the metadata decoded into nested dicts (from <dl>), lists (from
+       <ul>) and HTML strings (from elements of class "metavalue")."""
+
+    def walk_dl(e):
+        rv = {}
+        key = None
+        for child in e:
+            if child.tag == 'dt':
+                assert key is None  # the previous <dt> must already have been paired with a <dd>
+                assert len(child) == 0  # the key must be plain text with no child elements
+                key = child.text
+            else:
+                assert child.tag == 'dd'
+                assert key is not None
+                assert len(child) == 1
+                rv[key] = walk(child[0])
+                key = None
+        return rv
+
+    def walk_ul(e):
+        rv = []
+        for child in e:
+            assert child.tag == 'li'
+            assert len(child) == 1
+            rv.append(walk(child[0]))
+        return rv
+
+    def walk_value(e):
+        assert e.get('class') == 'metavalue'
+        # Setting e.tag and e.tail to None temporarily seems to be the
+        # least-hassle way to persuade ET.tostring to dump the *contents*
+        # of e but not e itself.
+        tag = e.tag
+        tail = e.tail
+        try:
+            e.tag = None
+            e.tail = None
+            return (ET.tostring(e, encoding="utf-8", method="html")
+                    .decode("utf-8"))
+        finally:
+            e.tag = tag  # restore the element even if serialization fails
+            e.tail = tail
+
+    def walk(e):
+        if e.tag == 'dl':
+            return walk_dl(e)
+        elif e.tag == 'ul':
+            return walk_ul(e)
+        elif e.tag == 'div' or e.tag == 'span':
+            return walk_value(e)
+        else:
+            raise RuntimeError("unexpected metadata structure: " +
+                               ET.tostring(e, encoding="utf-8", method="html")
+                               .decode("utf-8"))
+
+
+    metadata, _, document = text.partition("<hr />")  # if no "<hr />" is present, document is empty
+    document = document.strip()
+
+    # Remove namespaces from all metadata elements while parsing them.
+    # This is necessary because Pandoc thinks you have to put an
+    # xmlns= on every use of <math>, and that makes ET.tostring
+    # generate tags like <ns0:math>, which an HTML (not XHTML) parser
+    # will not understand.
+    it = ET.iterparse(StringIO(metadata))
+    for _, el in it:
+        if '}' in el.tag:
+            el.tag = el.tag.split('}', 1)[1]  # keep only the part after the "{namespace}" prefix
+
+    assert it.root.tag == 'dl'
+    return document, walk(it.root)
+
 class PandocReader(BaseReader):
     enabled = True
     file_extensions = ['md', 'markdown', 'mkd', 'mdown']
 
-    def read(self, filename):
-        with pelican_open(filename) as fp:
-            text = list(fp.splitlines())
-
-        metadata = {}
-        for i, line in enumerate(text):
-            kv = line.split(':', 1)
-            if len(kv) == 2:
-                name, value = kv[0].lower(), kv[1].strip()
-                metadata[name] = self.process_metadata(name, value)
-            else:
-                content = "\n".join(text[i:])
-                break
+    def memoize_settings(self):
+        """Load settings and compute the various subprocess invocations we
+           will be using."""
+        if hasattr(self, 'pd_extensions'): return
 
         extra_args = self.settings.get('PANDOC_ARGS', [])
-        extensions = self.settings.get('PANDOC_EXTENSIONS', '')
-        if isinstance(extensions, list):
-            extensions = ''.join(extensions)
+        extensions = set(self.settings.get('PANDOC_EXTENSIONS', []))
+
+        # +mmd_title_block is our default, for compatibility with
+        # older versions of this plugin that parsed Pelican-style
+        # metadata blocks themselves.
+        if ('-mmd_title_block' not in extensions and
+            '+mmd_title_block' not in extensions):
+            extensions.add('+mmd_title_block')
+
+        extensions = ''.join(sorted(extensions))
 
-        pandoc_cmd = ["pandoc", "--from=markdown" + extensions, "--to=html5"]
-        pandoc_cmd.extend(extra_args)
+        pd_cmd_1 = ["pandoc", "-f", "markdown"+extensions, "-t", "json"]
+        pd_cmd_2 = ["pandoc", "-f", "json", "-t", "html5"]
+        # We don't know whether the extra_args are relevant to the reader or
+        # writer, and it is harmless to supply them to both.
+        pd_cmd_1.extend(extra_args)
+        pd_cmd_2.extend(extra_args)
 
-        proc = subprocess.Popen(pandoc_cmd,
-                                stdin = subprocess.PIPE,
-                                stdout = subprocess.PIPE)
+        self.pd_cmd_1 = pd_cmd_1
+        self.pd_cmd_2 = pd_cmd_2
+        self.filt_cmd = [sys.executable, embed_metadata_filter.__file__]
 
-        output = proc.communicate(content.encode('utf-8'))[0].decode('utf-8')
-        status = proc.wait()
-        if status:
-            raise subprocess.CalledProcessError(status, pandoc_cmd)
+    def read(self, filename):
+        self.memoize_settings()
+
+        # We do not use --filter because that requires the filter to
+        # be directly executable.  By constructing a pipeline by hand
+        # we can use sys.executable and not worry about #! lines or
+        # execute bits.
+        PIPE = subprocess.PIPE
+        fp = None
+        p1 = None
+        p2 = None
+        p3 = None
+        try:
+            fp = pelican_open(filename)
+            p1 = subprocess.Popen(self.pd_cmd_1, stdin=fp, stdout=PIPE)
+            p2 = subprocess.Popen(self.filt_cmd, stdin=p1.stdout, stdout=PIPE)
+            p3 = subprocess.Popen(self.pd_cmd_2, stdin=p2.stdout, stdout=PIPE)
+
+            text = p3.stdout.read().decode("utf-8")
+
+        finally:
+            if fp is not None: fp.close()
+            if p1 is not None: check_command(p1)
+            if p2 is not None: check_command(p2)
+            if p3 is not None: check_command(p3)
+
+        document, raw_metadata = extract_metadata(text)
+        metadata = {}
+        for k, v in raw_metadata.items():
+            k = k.lower()
+            metadata[k] = self.process_metadata(k, v)
 
-        return output, metadata
+        return document, metadata
 
 def add_reader(readers):
     for ext in PandocReader.file_extensions: