liob · zackw · Mar 22, 2015
diff --git a/README.md b/README.md
@@ -1,14 +1,14 @@
 pandoc_reader
 =============
 
-A pandoc [markdown] reader plugin for [pelican]
+A pandoc [markdown][] reader plugin for [pelican][]
 
 
 Requirements
 ------------
 
-  - [pandoc] in $PATH
-
+  - [pandoc][] in `$PATH`
+  - [PyYAML][], only if YAML-format metadata is used.
 
 Installation
 ------------
@@ -19,7 +19,7 @@ Instructions for installation of pelican plugins can be obtained from the [pelic
 Configuration
 -------------
 
-Additional command line parameters can be passed to pandoc via the PANDOC_ARGS parameter.
+Additional command line parameters can be passed to pandoc via the `PANDOC_ARGS` parameter.
 
     PANDOC_ARGS = [
       '--mathjax',
@@ -30,13 +30,46 @@ Additional command line parameters can be passed to pandoc via the PANDOC_ARGS p
     ]
 
 Pandoc's markdown extensions can be enabled or disabled via the
-PANDOC_EXTENSIONS parameter.
+`PANDOC_EXTENSIONS` parameter.
 
     PANDOC_EXTENSIONS = [
       '+hard_line_breaks',
       '-citations'
     ]
 
+Hard tabs in the file are expanded to spaces before the file is passed
+to Pandoc.  The tab width can be set with the `PANDOC_TAB_WIDTH`
+parameter; the default is 8.
+
+File Metadata
+-------------
+
+By default, metadata conforms to the same syntax understood by Python
+Markdown's [meta-data extension][], which is not unlike that used for
+email headers.  It's easiest to give an example:
+
+    Title:   My Document
+    Summary: A brief description of my document.
+    Authors: Waylan Limberg
+             John Doe
+    Date:    October 2, 2007
+    blank-value:
+    base_url: http://example.com
+
+    This is the first paragraph of the document.
+
+If the first line of the document is exactly '`---`' (three dashes),
+then the metadata instead ends at the next line which is either
+exactly '`---`' or exactly '`...`', and everything in between will be
+parsed as [YAML][], using the [PyYAML][] library.  Note that Python
+Markdown also recognizes `---` to `...` as metadata delimiters but
+does *not* parse what's in between as YAML.
+
+In either syntax, all top-level metadata keys are folded to lowercase
+(as expected by Pelican core).  The metadata does *not* pass through
+Pandoc; this means, for instance, that Markdown notation within a
+metadata value will not be processed.
+
 Contributing
 ------------
 
@@ -50,3 +83,6 @@ Contributing
 [markdown]: http://daringfireball.net/projects/markdown/
 [pandoc]: http://johnmacfarlane.net/pandoc/
 [pelican]: http://getpelican.com
+[YAML]: http://yaml.org/
+[PyYAML]: http://pyyaml.org/
+[meta-data extension]: https://pythonhosted.org/Markdown/extensions/meta_data.html
diff --git a/pandoc_reader.py b/pandoc_reader.py
@@ -1,25 +1,97 @@
+import re
 import subprocess
 from pelican import signals
 from pelican.readers import BaseReader
 from pelican.utils import pelican_open
 
+# The syntax of "plain" metadata is aligned with markdown.extensions.meta.
+# A "key: value" line; the key may be indented by at most three spaces.
+META_RE = re.compile(r'^[ ]{0,3}(?P<key>[A-Za-z0-9_-]+):\s*(?P<value>.*)')
+# A continuation line (four or more leading spaces) adding another value
+# to the most recently seen key.
+META_MORE_RE = re.compile(r'^[ ]{4,}(?P<value>.*)')
+# An empty or whitespace-only line, which terminates the metadata block.
+END_RE = re.compile(r'^\s*$')
+
+def parse_plain_metadata(lines):
+    """Parse a leading "Key: value" metadata block and strip it from *lines*.
+
+    *lines* is a list of strings and is modified in place: every line that
+    belongs to the metadata block is removed, together with the blank line
+    that ends the block (if there is one).  Parsing stops, without
+    consuming the line, at the first line that is neither a "key: value"
+    line nor a continuation of a previous key.
+
+    Returns a dict mapping lowercased keys to their value.  A key that was
+    given exactly one value maps to that string; a key with several values
+    (repeated key and/or continuation lines) maps to a list of strings.
+    """
+    meta = {}
+    # No key has been seen yet.  This must be initialised: a document whose
+    # first line is indented by four or more spaces (e.g. a code block)
+    # matches META_MORE_RE before any key exists.
+    key = None
+    consumed = 0
+
+    for line in lines:
+        if END_RE.match(line):
+            # The blank separator is part of the metadata block; drop it.
+            consumed += 1
+            break
+
+        m = META_RE.match(line)
+        if m:
+            key = m.group('key').lower().strip()
+            meta.setdefault(key, []).append(m.group('value').strip())
+        else:
+            more = META_MORE_RE.match(line)
+            if more is None or key is None:
+                # Start of the document body; leave this line in place.
+                break
+            meta[key].append(more.group('value').strip())
+        consumed += 1
+
+    del lines[:consumed]
+
+    # Flatten all 1-entry lists.  (Every list holds at least one value.)
+    for k, v in list(meta.items()):
+        if len(v) == 1:
+            meta[k] = v[0]
+
+    return meta
+
+# PyYAML is imported lazily, on the first request for the loader, so that
+# it is only needed when a document actually uses YAML metadata.
+yaml_load = None
+def get_yaml_load():
+    """Return the cached YAML loading function, creating it on first use.
+
+    The returned callable takes a list of lines, joins them with newlines
+    and parses the result with PyYAML's safe loader (the C implementation
+    when it can be imported, the pure-Python one otherwise).
+    """
+    global yaml_load
+    if yaml_load is not None:
+        return yaml_load
+
+    import yaml
+    try:
+        from yaml import CSafeLoader as loader_cls
+    except ImportError:
+        from yaml import SafeLoader as loader_cls
+
+    def _yaml_load(block):
+        text = "\n".join(block)
+        return yaml.load(text, loader_cls)
+
+    yaml_load = _yaml_load
+    return _yaml_load
+
+def parse_yaml_metadata(lines):
+    """Parse a leading YAML metadata block and strip it from *lines*.
+
+    The first element of *lines* is taken to be the opening delimiter and
+    is not inspected.  The block ends at the next line that is exactly
+    '---' or exactly '...'.  On success the opening delimiter, the block
+    and the closing delimiter are removed from *lines* in place.
+
+    Returns a dict of the top-level YAML mapping with its keys folded to
+    lowercase.  If no closing delimiter is found, *lines* is left untouched
+    and an empty dict is returned.  An empty block also yields an empty
+    dict.
+
+    Raises ValueError if the block parses to something other than a
+    mapping (for instance a bare scalar or a list).
+    """
+    for i, line in enumerate(lines):
+        if i > 0 and line in ('---', '...'):
+            yblock = lines[1:i]
+            del lines[0:(i+1)]
+            break
+    else:
+        return {}
+
+    meta = get_yaml_load()(yblock)
+    if meta is None:
+        # An empty (or comment-only) YAML document loads as None.
+        return {}
+    if not isinstance(meta, dict):
+        raise ValueError("YAML metadata block must be a mapping, not %s"
+                         % type(meta).__name__)
+
+    # YAML keys need not be strings (e.g. "2015: x" gives an int key);
+    # anything without a lower() method is converted to text first.
+    return { (k.lower() if hasattr(k, 'lower') else str(k).lower()): v
+             for k, v in meta.items() }
+
+
 class PandocReader(BaseReader):
     enabled = True
     file_extensions = ['md', 'markdown', 'mkd', 'mdown']
 
     def read(self, filename):
+        tab_width = self.settings.get('PANDOC_TAB_WIDTH', 8)
         with pelican_open(filename) as fp:
-            text = list(fp.splitlines())
-
-        metadata = {}
-        for i, line in enumerate(text):
-            kv = line.split(':', 1)
-            if len(kv) == 2:
-                name, value = kv[0].lower(), kv[1].strip()
-                metadata[name] = self.process_metadata(name, value)
-            else:
-                content = "\n".join(text[i:])
-                break
+            text = [line.expandtabs(tab_width) for line in fp.splitlines()]
+
+        if text[0] == '---':
+            metadata = parse_yaml_metadata(text)
+        else:
+            metadata = parse_plain_metadata(text)
+
+        metadata = { k: self.process_metadata(k, v)
+                     for k, v in metadata.items() }
+        content = "\n".join(text)
 
         extra_args = self.settings.get('PANDOC_ARGS', [])
         extensions = self.settings.get('PANDOC_EXTENSIONS', '')