Skip to content

Commit ce9dc36

Browse files
committed
Allow Pandoc to parse metadata rather than doing it in the plugin.
This involves a fairly complicated dance with a Pandoc "filter" module in order to get all of the metadata to be visible in the output, but means that all metadata formats supported by Pandoc are available without the need for any additional Python modules. It also means strings in metadata will be processed as Markdown.
1 parent 9ef0197 commit ce9dc36

File tree

3 files changed

+202
-32
lines changed

3 files changed

+202
-32
lines changed

README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,23 @@
11
pandoc_reader
22
=============
33

4-
A pandoc [markdown] reader plugin for [pelican]
4+
A pandoc [markdown][] reader plugin for [pelican][]
55

66

77
Requirements
88
------------
99

10-
- [pandoc] in $PATH
11-
10+
- [pandoc][] in `$PATH`
1211

1312
Installation
1413
------------
1514

1615
Instructions for installation of pelican plugins can be obtained from the [pelican plugin manual](https://github.com/getpelican/pelican-plugins/blob/master/Readme.rst).
1716

18-
1917
Configuration
2018
-------------
2119

22-
Additional command line parameters can be passed to pandoc via the PANDOC_ARGS parameter.
20+
Additional command line parameters can be passed to pandoc via the `PANDOC_ARGS` parameter.
2321

2422
PANDOC_ARGS = [
2523
'--mathjax',
@@ -30,13 +28,15 @@ Additional command line parameters can be passed to pandoc via the PANDOC_ARGS p
3028
]
3129

3230
Pandoc's markdown extensions can be enabled or disabled via the
33-
PANDOC_EXTENSIONS parameter.
31+
`PANDOC_EXTENSIONS` parameter.
3432

3533
PANDOC_EXTENSIONS = [
3634
'+hard_line_breaks',
3735
'-citations'
3836
]
3937

38+
In addition to Pandoc's own default set of extensions, this plugin enables the `mmd_title_block` extension by default. It can be disabled with `'-mmd_title_block'` as usual.
39+
4040
Contributing
4141
------------
4242

embed_metadata_filter.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# This is a filter script which embeds all of the metadata parsed by
2+
# Pandoc into the HTML output, where the main body of the reader can
3+
# pick it up. In order to preserve Pandoc's translation of Markdown
4+
# in metadata values, we convert the metadata structure into an HTML
5+
# tree structure. A <hr> separates the translated metadata from the
6+
# document itself.
7+
#
8+
# See http://johnmacfarlane.net/pandoc/scripting.html for documentation
9+
# of the JSON-serialized AST that we are manipulating.
10+
11+
import json
12+
import sys
13+
14+
def N(t, c, cls=None):
    """Build one node of Pandoc's JSON-serialized AST.

    T is the node type and C its contents.  When CLS is given, the
    contents are wrapped as [attributes, C], where the attributes are
    the list ["", [CLS], []] (presumably Pandoc's identifier / classes /
    key-value triple, with CLS as the only class).
    """
    if cls is None:
        content = c
    else:
        attributes = ["", [cls], []]
        content = [attributes, c]
    return {"t": t, "c": content}
17+
18+
def cvt_metainlines(c):
    """Wrap the inline elements C in a Plain block holding a single
    Span of class "metavalue"."""
    span = N("Span", c, "metavalue")
    return N("Plain", [span])
20+
21+
def cvt_metamap(c):
    """Convert the metadata mapping C into a DefinitionList node.

    Each key becomes a term (a single Str), and its converted value
    becomes that term's only definition.  Entries are emitted in sorted
    key order.
    """
    entries = []
    for key in sorted(c):
        term = [N("Str", key)]
        definitions = [[convert(c[key])]]
        entries.append((term, definitions))
    return N("DefinitionList", entries)
24+
25+
# Dispatch table: metadata node type -> function turning that node's
# contents into an ordinary document node.
CONVERTERS = {
    # Scalars all end up as a "metavalue" span; booleans are spelled
    # "true" / "false".
    "MetaInlines": cvt_metainlines,
    "MetaString": lambda text: cvt_metainlines([N("Str", text)]),
    "MetaBool": lambda flag: cvt_metainlines([N("Str", str(flag).lower())]),
    # Block-level values are kept as blocks inside a "metavalue" div.
    "MetaBlocks": lambda blocks: N("Div", blocks, "metavalue"),
    # Containers recurse through convert().
    "MetaList": lambda items: N("BulletList",
                                [[convert(entry)] for entry in items]),
    "MetaMap": cvt_metamap,
}
33+
34+
def convert(item):
    """Convert one metadata node by dispatching on its "t" field.

    Raises KeyError if the node type has no entry in CONVERTERS.
    """
    handler = CONVERTERS[item["t"]]
    return handler(item["c"])
36+
37+
def main():
    """Read a JSON document from stdin, prepend its metadata (rendered
    as a definition list followed by a horizontal rule) to the document
    body, and write the result to stdout as compact JSON."""
    doc = json.load(sys.stdin)
    header = doc[0]
    body = [cvt_metamap(header['unMeta']), N("HorizontalRule", [])]
    body += doc[1]
    json.dump([header, body], sys.stdout, separators=(',', ':'))


# This filter script is imported by pandoc_reader in order to learn its
# actual filename, so don't do anything unless invoked as __main__.
if __name__ == '__main__':
    main()

pandoc_reader.py

Lines changed: 149 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,167 @@
11
import subprocess
2+
import sys
3+
4+
try: import xml.etree.cElementTree as ET
5+
except ImportError: import xml.etree.ElementTree as ET
6+
7+
try: from io import StringIO
8+
except ImportError: from cStringIO import StringIO
9+
210
from pelican import signals
311
from pelican.readers import BaseReader
412
from pelican.utils import pelican_open
513

14+
from . import embed_metadata_filter
15+
16+
def check_command(proc, cmd):
    """Wait for PROC to finish and raise subprocess.CalledProcessError
    if its exit status is nonzero, much as subprocess.check_call would.
    CMD is the command that was passed to subprocess.Popen; it is only
    used to label the exception."""
    exit_status = proc.wait()
    if not exit_status:
        return
    raise subprocess.CalledProcessError(exit_status, cmd)
23+
24+
def extract_metadata(text):
    """Split Pandoc's HTML output into the document and its metadata.

    A filter script converts Pandoc's internal representation of the
    metadata into an HTML tree structure so that it will make it to
    the output, with strings properly formatted.  Separate that tree
    (everything before the first "<hr />") from the HTML for the
    document itself, and decode it into nested Python values:

      <dl>                      -> dict keyed by the <dt> text
      <ul>                      -> list
      <span>/<div> "metavalue"  -> string holding the element's inner HTML

    Returns a (document, metadata) pair, where metadata is the dict
    decoded from the top-level <dl>.

    Raises RuntimeError if the metadata tree does not have the expected
    shape.  Explicit raises are used rather than assert statements so
    that the checks are not stripped when Python runs with -O.
    """

    def render(e):
        return (ET.tostring(e, encoding="utf-8", method="html")
                .decode("utf-8"))

    def require(condition, e):
        # Single place where structural errors are reported.
        if not condition:
            raise RuntimeError("unexpected metadata structure: " + render(e))

    def walk_dl(e):
        rv = {}
        key = None
        for child in e:
            if child.tag == 'dt':
                # A term must be plain text and must not directly
                # follow another term.
                require(key is None and len(child) == 0, e)
                key = child.text
            else:
                # Anything else must be the single-element definition
                # belonging to the preceding term.
                require(child.tag == 'dd' and key is not None
                        and len(child) == 1, e)
                rv[key] = walk(child[0])
                key = None
        return rv

    def walk_ul(e):
        rv = []
        for child in e:
            require(child.tag == 'li' and len(child) == 1, e)
            rv.append(walk(child[0]))
        return rv

    def walk_value(e):
        require(e.get('class') == 'metavalue', e)
        # Setting e.tag and e.tail to None temporarily seems to be the
        # least-hassle way to persuade ET.tostring to dump the *contents*
        # of e but not e itself.
        tag = e.tag
        tail = e.tail
        try:
            e.tag = None
            e.tail = None
            return render(e)
        finally:
            e.tag = tag
            e.tail = tail

    def walk(e):
        if e.tag == 'dl':
            return walk_dl(e)
        elif e.tag == 'ul':
            return walk_ul(e)
        elif e.tag == 'div' or e.tag == 'span':
            return walk_value(e)
        else:
            require(False, e)

    metadata, _, document = text.partition("<hr />")
    document = document.strip()

    # Remove namespaces from all metadata elements while parsing them.
    # This is necessary because Pandoc thinks you have to put an
    # xmlns= on every use of <math>, and that makes ET.tostring
    # generate tags like <ns0:math>, which an HTML (not XHTML) parser
    # will not understand.
    it = ET.iterparse(StringIO(metadata))
    for _, el in it:
        if '}' in el.tag:
            el.tag = el.tag.split('}', 1)[1]

    require(it.root.tag == 'dl', it.root)
    return document, walk(it.root)
99+
6100
class PandocReader(BaseReader):
    """Pelican reader that renders Markdown to HTML5 with Pandoc.

    Metadata is parsed by Pandoc rather than by this plugin: the source
    is converted to Pandoc's JSON representation, passed through
    embed_metadata_filter (which moves the metadata into the document
    body), rendered to HTML5, and then split apart again by
    extract_metadata().
    """
    enabled = True
    file_extensions = ['md', 'markdown', 'mkd', 'mdown']

    def memoize_settings(self):
        """Load settings and compute the various subprocess invocations
        we will be using.

        Sets self.pd_cmd_1 (Markdown -> JSON), self.pd_cmd_2
        (JSON -> HTML5) and self.filt_cmd (the metadata filter, run with
        the current Python interpreter).  Later calls return immediately.
        """
        # Test for an attribute this method actually sets; otherwise the
        # commands would be rebuilt on every call.
        if hasattr(self, 'pd_cmd_1'):
            return

        extra_args = self.settings.get('PANDOC_ARGS', [])
        extensions = self.settings.get('PANDOC_EXTENSIONS', [])

        # PANDOC_EXTENSIONS may be one string such as
        # '+hard_line_breaks-citations' instead of a list.  Wrap it so
        # that set() below does not split it into single characters.
        try:
            string_types = (basestring,)  # Python 2
        except NameError:
            string_types = (str,)
        if isinstance(extensions, string_types):
            extensions = [extensions] if extensions else []
        extensions = set(extensions)

        # +mmd_title_block is our default, for compatibility with
        # older versions of this plugin that parsed Pelican-style
        # metadata blocks themselves.  Substring tests are used so the
        # user's choice is also honoured inside a combined string.
        mentioned = any('-mmd_title_block' in ext or '+mmd_title_block' in ext
                        for ext in extensions)
        if not mentioned:
            extensions.add('+mmd_title_block')

        extensions = ''.join(sorted(extensions))

        pd_cmd_1 = ["pandoc", "-f", "markdown" + extensions, "-t", "json"]
        pd_cmd_2 = ["pandoc", "-f", "json", "-t", "html5"]
        # We don't know whether the extra_args are relevant to the reader or
        # writer, and it is harmless to supply them to both.
        pd_cmd_1.extend(extra_args)
        pd_cmd_2.extend(extra_args)

        self.pd_cmd_1 = pd_cmd_1
        self.pd_cmd_2 = pd_cmd_2
        self.filt_cmd = [sys.executable, embed_metadata_filter.__file__]

    def read(self, filename):
        """Render FILENAME with Pandoc.

        Returns a (document, metadata) pair: the HTML for the document
        body, and a dict mapping lower-cased metadata names to the
        result of self.process_metadata() for each value.

        Raises subprocess.CalledProcessError if any stage of the
        conversion exits unsuccessfully.
        """
        self.memoize_settings()

        # pelican_open is a context manager yielding the file's text,
        # not a file object, so it cannot be handed to Popen as stdin.
        with pelican_open(filename) as content:
            data = content.encode('utf-8')

        # We do not use --filter because that requires the filter to
        # be directly executable.  Running the stages by hand lets us
        # use sys.executable and not worry about #! lines or execute
        # bits.  Each stage runs to completion and is checked (with its
        # own command line) before the next one starts, so a failure is
        # reported for the stage that caused it and no child is left
        # un-waited.
        for cmd in (self.pd_cmd_1, self.filt_cmd, self.pd_cmd_2):
            proc = subprocess.Popen(cmd,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE)
            data = proc.communicate(data)[0]
            check_command(proc, cmd)

        document, raw_metadata = extract_metadata(data.decode('utf-8'))
        metadata = {}
        for k, v in raw_metadata.items():
            k = k.lower()
            metadata[k] = self.process_metadata(k, v)

        return document, metadata
42165

43166
def add_reader(readers):
44167
for ext in PandocReader.file_extensions:

0 commit comments

Comments
 (0)