Skip to content

Commit ccffc27

Browse files
committed
Allow Pandoc to parse metadata rather than doing it in the plugin.
This involves a fairly complicated dance with a Pandoc "filter" module in order to get all of the metadata to be visible in the output, but means that all metadata formats supported by Pandoc are available without the need for any additional Python modules. It also means strings in metadata will be processed as Markdown. NOTE: Thanks to jgm/pandoc#2026 and backward compatibility constraints, this change defaults to enabling 'mmd_title_block' and *disabling* 'pandoc_title_block' and 'yaml_metadata_block'. Moreover, putting either +pandoc_title_block or +yaml_metadata_block in PANDOC_EXTENSIONS will cause mmd_title_block to be disabled.
1 parent 9ef0197 commit ccffc27

File tree

3 files changed

+249
-35
lines changed

3 files changed

+249
-35
lines changed

README.md

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,23 @@
11
pandoc_reader
22
=============
33

4-
A pandoc [markdown] reader plugin for [pelican]
4+
A pandoc [markdown][] reader plugin for [pelican][]
55

66

77
Requirements
88
------------
99

10-
- [pandoc] in $PATH
11-
10+
- [pandoc][] in `$PATH`
1211

1312
Installation
1413
------------
1514

1615
Instructions for installation of pelican plugins can be obtained from the [pelican plugin manual](https://github.com/getpelican/pelican-plugins/blob/master/Readme.rst).
1716

18-
1917
Configuration
2018
-------------
2119

22-
Additional command line parameters can be passed to pandoc via the PANDOC_ARGS parameter.
20+
Additional command line parameters can be passed to pandoc via the `PANDOC_ARGS` parameter.
2321

2422
PANDOC_ARGS = [
2523
'--mathjax',
@@ -29,14 +27,19 @@ Additional command line parameters can be passed to pandoc via the PANDOC_ARGS p
2927
'--number-sections',
3028
]
3129

32-
Pandoc's markdown extensions can be enabled or disabled via the
33-
PANDOC_EXTENSIONS parameter.
30+
Pandoc's syntactic extensions to Markdown can be enabled or disabled via the
31+
`PANDOC_EXTENSIONS` parameter.
3432

3533
PANDOC_EXTENSIONS = [
3634
'+hard_line_breaks',
3735
'-citations'
3836
]
3937

38+
File Metadata
39+
-------------
40+
41+
For compatibility with older versions of this plugin that parsed MultiMarkdown-like title blocks internally, the [`mmd_title_block`][mmd_title_block] syntax extension is enabled by default. Unfortunately, this causes Pandoc to misinterpret YAML metadata and possibly also native title blocks (see [Pandoc issue 2026][]). Therefore, those metadata formats are *disabled* by default. To revert to Pandoc's default behavior (accepting native title blocks and YAML metadata, but not MMD title blocks), include `-mmd_title_block` in `PANDOC_EXTENSIONS`.
42+
4043
Contributing
4144
------------
4245

@@ -50,3 +53,5 @@ Contributing
5053
[markdown]: http://daringfireball.net/projects/markdown/
5154
[pandoc]: http://johnmacfarlane.net/pandoc/
5255
[pelican]: http://getpelican.com
56+
[mmd_title_block]: http://johnmacfarlane.net/pandoc/README.html#extension-mmd_title_block
57+
[Pandoc issue 2026]: https://github.com/jgm/pandoc/issues/2026

embed_metadata_filter.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# This is a filter script which embeds all of the metadata parsed by
2+
# Pandoc into the HTML output, where the main body of the reader can
3+
# pick it up. In order to preserve Pandoc's translation of Markdown
4+
# in metadata values, we convert the metadata structure into an HTML
5+
# tree structure. A <hr> separates the translated metadata from the
6+
# document itself.
7+
#
8+
# See http://johnmacfarlane.net/pandoc/scripting.html for documentation
9+
# of the JSON-serialized AST that we are manipulating.
10+
11+
import json
12+
import sys
13+
14+
def N(t, c, cls=None):
    """Build a node of Pandoc's JSON-serialized AST.

    t   -- the node type name, stored under the "t" key.
    c   -- the node contents, stored under the "c" key.
    cls -- optional class name.  When given, the contents are wrapped
           as [attr, c], where attr is ["", [cls], []].

    Returns a dict of the form {"t": t, "c": c}.
    """
    if cls is not None:
        c = [["", [cls], []], c]
    return {"t": t, "c": c}
17+
18+
def cvt_metainlines(c):
19+
return N("Plain", [N("Span", c, "metavalue")])
20+
21+
def cvt_metamap(c):
22+
return N("DefinitionList", [ ( [N("Str", key)], [[ convert(val) ]] )
23+
for key, val in sorted(c.items()) ])
24+
25+
# Dispatch table from a metadata node's type tag ("t") to the function
# that converts its contents ("c") into ordinary document nodes.
CONVERTERS = {
    "MetaMap": cvt_metamap,
    "MetaInlines": cvt_metainlines,
    # Booleans are rendered as the lowercase strings "true" / "false".
    "MetaBool": lambda c: cvt_metainlines([N("Str", str(c).lower())]),
    "MetaString": lambda c: cvt_metainlines([N("Str", c)]),
    "MetaBlocks": lambda c: N("Div", c, "metavalue"),
    "MetaList": lambda c: N("BulletList", [[convert(item)] for item in c]),
}
33+
34+
def convert(item):
    """Convert one metadata node by dispatching on its "t" tag.

    Raises KeyError if the tag has no entry in CONVERTERS.
    """
    return CONVERTERS[item["t"]](item["c"])
36+
37+
def main():
38+
blob = json.load(sys.stdin)
39+
metadata = blob[0]['unMeta']
40+
rendered = [cvt_metamap(metadata), N("HorizontalRule", [])]
41+
rendered.extend(blob[1])
42+
blob = [blob[0], rendered]
43+
json.dump(blob, sys.stdout, separators=(',',':'))
44+
45+
# This filter script is imported by pandoc_reader in order to learn its
46+
# actual filename, so don't do anything unless invoked as __main__.
47+
if __name__ == '__main__': main()

pandoc_reader.py

Lines changed: 190 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,206 @@
11
import subprocess
2+
import sys
3+
4+
import logging
5+
logger = logging.getLogger(__name__)
6+
7+
try: import xml.etree.cElementTree as ET
8+
except ImportError: import xml.etree.ElementTree as ET
9+
10+
try: from io import StringIO
11+
except ImportError: from cStringIO import StringIO
12+
213
from pelican import signals
314
from pelican.readers import BaseReader
4-
from pelican.utils import pelican_open
15+
16+
from . import embed_metadata_filter
17+
18+
def check_command(proc, cmd):
    """Roughly as subprocess.check_call does, wait for PROC and throw
    an exception if it didn't exit successfully.  CMD should be the
    command passed to subprocess.Popen.

    Raises subprocess.CalledProcessError, carrying the exit status and
    CMD, when the status returned by PROC.wait() is nonzero.
    """
    status = proc.wait()
    if status:
        raise subprocess.CalledProcessError(status, cmd)
25+
26+
def extract_metadata(text):
    """A filter script converts Pandoc's internal representation of the
    metadata into an HTML tree structure so that it will make it to
    the output, with strings properly formatted.  Separate that
    tree from the HTML for the document itself, and decode it into
    Pelican's desired representation.

    TEXT is the HTML produced by the pipeline: a <dl> holding the
    metadata, then the first "<hr />", then the document.

    Returns a (document, metadata) pair.  metadata is a dict built from
    the <dl>; its values are HTML strings (from "metavalue" elements),
    lists (from <ul>) or nested dicts (from <dl>).

    Raises ValueError if the metadata tree does not have the shape the
    filter script generates.
    """

    def dump(e):
        return (ET.tostring(e, encoding="utf-8", method="html")
                .decode("utf-8"))

    def require(ok, what):
        # The input comes from a subprocess, so validate it with a real
        # exception rather than 'assert', which vanishes under -O.
        if not ok:
            raise ValueError("malformed metadata structure: " + what)

    def walk_dl(e):
        # <dt>/<dd> children must strictly alternate; each <dd> holds
        # exactly one element, the converted value.
        rv = {}
        key = None
        for child in e:
            if child.tag == "dt":
                require(key is None, "<dt> without a following <dd>")
                require(len(child) == 0, "markup inside <dt>")
                key = child.text
            else:
                require(child.tag == "dd",
                        "unexpected <{}> inside <dl>".format(child.tag))
                require(key is not None, "<dd> without a preceding <dt>")
                require(len(child) == 1,
                        "<dd> must contain exactly one element")
                rv[key] = walk(child[0])
                key = None
        return rv

    def walk_ul(e):
        rv = []
        for child in e:
            require(child.tag == "li",
                    "unexpected <{}> inside <ul>".format(child.tag))
            require(len(child) == 1, "<li> must contain exactly one element")
            rv.append(walk(child[0]))
        return rv

    def walk_value(e):
        require(e.get("class") == "metavalue",
                "value element lacks class 'metavalue'")
        # Setting e.tag and e.tail to None temporarily seems to be the
        # least-hassle way to persuade ET.tostring to dump the *contents*
        # of e but not e itself.
        tag = e.tag
        tail = e.tail
        try:
            e.tag = None
            e.tail = None
            return dump(e).strip()
        finally:
            e.tag = tag
            e.tail = tail

    def walk(e):
        if e.tag == "dl":
            return walk_dl(e)
        elif e.tag == "ul":
            return walk_ul(e)
        elif e.tag == "div" or e.tag == "span":
            return walk_value(e)
        else:
            # Best effort: report the oddity and carry on with no value.
            logger.error("unexpected metadata structure: %s", dump(e))
            return None

    metadata, _, document = text.partition("<hr />")
    document = document.strip()

    # Remove namespaces from all metadata elements while parsing them.
    # This is necessary because Pandoc thinks you have to put an
    # xmlns= on every use of <math>, and that makes ET.tostring
    # generate tags like <ns0:math>, which an HTML (not XHTML) parser
    # will not understand.
    it = ET.iterparse(StringIO(metadata))
    for _, el in it:
        if "}" in el.tag:
            el.tag = el.tag.split("}", 1)[1]

    # it.root only becomes available once the iterator is exhausted.
    require(it.root.tag == "dl", "top-level element is not <dl>")
    return document, walk(it.root)
5101

6102
class PandocReader(BaseReader):
    """Pelican reader that converts Markdown to HTML5 with Pandoc and
    lets Pandoc parse the file's metadata.

    Each file is run through a three-stage pipeline: Pandoc (Markdown
    to JSON), the embed_metadata_filter script, and Pandoc again (JSON
    to HTML5).  The metadata is then recovered from the HTML by
    extract_metadata.
    """
    enabled = True
    file_extensions = ["md", "markdown", "mkd", "mdown"]

    def memoize_settings(self):
        """Load settings and compute the various subprocess invocations we
        will be using.

        Sets self.pd_cmd_1 (reader command), self.pd_cmd_2 (writer
        command) and self.filt_cmd (filter command).  Does nothing if
        they have already been computed.
        """
        # Guard on an attribute this method actually assigns, so the
        # work below is done only once per reader instance.
        if hasattr(self, "pd_cmd_1"):
            return

        extra_args = self.settings.get("PANDOC_ARGS", [])

        pos_extensions = set()
        neg_extensions = set()
        for ext in self.settings.get("PANDOC_EXTENSIONS", []):
            if len(ext) >= 2 and ext[0] == "-":
                neg_extensions.add(ext[1:])
            elif len(ext) >= 2 and ext[0] == "+":
                pos_extensions.add(ext[1:])
            else:
                logger.error("invalid PANDOC_EXTENSIONS item {!r}".format(ext))

        # For compatibility with older versions of this plugin that
        # parsed vaguely MMD-style metadata blocks themselves, we
        # default to +mmd_title_block.  Unfortunately,
        # +mmd_title_block causes Pandoc to mis-parse YAML and
        # possibly also native title blocks (see
        # https://github.com/jgm/pandoc/issues/2026).  Therefore,
        # if there's nothing about title blocks in PANDOC_EXTENSIONS,
        # we also explicitly disable YAML and native title blocks.
        mentioned = pos_extensions | neg_extensions
        title_block_exts = ("mmd_title_block",
                            "pandoc_title_block",
                            "yaml_metadata_block")
        if not any(ext in mentioned for ext in title_block_exts):
            pos_extensions.add("mmd_title_block")
            neg_extensions.add("pandoc_title_block")
            neg_extensions.add("yaml_metadata_block")

        both_exts = pos_extensions & neg_extensions
        if both_exts:
            logger.error("Pandoc syntax extensions both enabled and disabled: "
                         + " ".join(sorted(both_exts)))
            pos_extensions -= both_exts
            neg_extensions -= both_exts

        syntax = "markdown"
        if pos_extensions:
            syntax += "".join(sorted("+" + ext for ext in pos_extensions))
        if neg_extensions:
            syntax += "".join(sorted("-" + ext for ext in neg_extensions))

        pd_cmd_1 = ["pandoc", "-f", syntax, "-t", "json"]
        pd_cmd_2 = ["pandoc", "-f", "json", "-t", "html5"]
        # We don't know whether the extra_args are relevant to the reader or
        # writer, and it is harmless to supply them to both.
        pd_cmd_1.extend(extra_args)
        pd_cmd_2.extend(extra_args)

        self.pd_cmd_1 = pd_cmd_1
        self.pd_cmd_2 = pd_cmd_2
        self.filt_cmd = [sys.executable, embed_metadata_filter.__file__]
        logger.debug("Reader command: " + " ".join(self.pd_cmd_1))
        logger.debug("Writer command: " + " ".join(self.pd_cmd_2))
        logger.debug("Filter command: " + " ".join(self.filt_cmd))

    @staticmethod
    def _finish_pipeline(stages, check):
        """Close our pipe ends and reap every process in STAGES, a list
        of (process, command) pairs in pipeline order.

        Every process is waited for, even after one has failed.  If
        CHECK is true, raise subprocess.CalledProcessError for the
        first stage that exited unsuccessfully.
        """
        for proc, _ in stages:
            if proc.stdout is not None and not proc.stdout.closed:
                proc.stdout.close()

        failure = None
        for proc, cmd in stages:
            try:
                check_command(proc, cmd)
            except subprocess.CalledProcessError as err:
                if failure is None:
                    failure = err
        if check and failure is not None:
            raise failure

    def read(self, filename):
        """Convert FILENAME with Pandoc.

        Returns a (document, metadata) pair: the HTML for the document
        body, and a dict of metadata with lowercased keys whose values
        have been passed through self.process_metadata.

        Raises subprocess.CalledProcessError if any stage of the
        pipeline exits unsuccessfully.
        """
        self.memoize_settings()

        # We do not use --filter because that requires the filter to
        # be directly executable.  By constructing a pipeline by hand
        # we can use sys.executable and not worry about #! lines or
        # execute bits.
        stages = []
        succeeded = False
        try:
            with open(filename, "rb") as fp:
                upstream = fp
                for cmd in (self.pd_cmd_1, self.filt_cmd, self.pd_cmd_2):
                    proc = subprocess.Popen(cmd, stdin=upstream,
                                            stdout=subprocess.PIPE)
                    stages.append((proc, cmd))
                    if upstream is not fp:
                        # The child now holds its own copy of this pipe.
                        # Closing ours lets the upstream process receive
                        # SIGPIPE if this stage exits early.
                        upstream.close()
                    upstream = proc.stdout

                text = upstream.read().decode("utf-8")
            succeeded = True
        finally:
            # Only report subprocess failures on the success path, so
            # that an exception already in flight is not masked.
            self._finish_pipeline(stages, check=succeeded)

        document, raw_metadata = extract_metadata(text)
        metadata = {}
        for k, v in raw_metadata.items():
            k = k.lower()
            metadata[k] = self.process_metadata(k, v)

        return document, metadata
42204

43205
def add_reader(readers):
44206
for ext in PandocReader.file_extensions:

0 commit comments

Comments
 (0)