From 330f159e814331a532b1d29358e5556fdee67fde Mon Sep 17 00:00:00 2001
From: Evan Silberman
Date: Thu, 6 Feb 2025 14:39:17 -0800
Subject: [PATCH] mediawiki reader: improve strong/emph conformance

---
 pandoc.cabal                         |   1 +
 src/Text/Pandoc/Readers/MediaWiki.hs |  38 ++++--
 test/Tests/Readers/MediaWiki.hs      | 174 +++++++++++++++++++++++++++
 test/mediawiki-reader.native         |   6 +-
 test/test-pandoc.hs                  |   2 +
 5 files changed, 211 insertions(+), 10 deletions(-)
 create mode 100644 test/Tests/Readers/MediaWiki.hs

diff --git a/pandoc.cabal b/pandoc.cabal
index b064658d0b4b..25accd78f43c 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -845,6 +845,7 @@ test-suite test-pandoc
                   Tests.Readers.FB2
                   Tests.Readers.Pod
                   Tests.Readers.DokuWiki
+                  Tests.Readers.MediaWiki
                   Tests.Writers.Native
                   Tests.Writers.ConTeXt
                   Tests.Writers.DocBook
diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
index 1112007c6bce..16ab55ec1b57 100644
--- a/src/Text/Pandoc/Readers/MediaWiki.hs
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -40,6 +40,7 @@ import Text.Pandoc.Shared (safeRead, stringify, stripTrailingNewlines,
                            trim, splitTextBy, tshow, formatCode)
 import Text.Pandoc.Char (isCJK)
 import Text.Pandoc.XML (fromEntities)
+import Data.Functor (($>))
 
 -- | Read mediawiki from an input string and return a Pandoc document.
 readMediaWiki :: (PandocMonad m, ToSources a)
@@ -531,8 +532,8 @@ inline = whitespace
       <|> url
       <|> str
       <|> doubleQuotes
-      <|> strong
       <|> emph
+      <|> strong
       <|> image
       <|> internalLink
       <|> externalLink
@@ -699,14 +700,37 @@ inlinesBetween start end =
   trimInlines . mconcat <$> try (start >> many1Till inline end)
 
+-- | Parse emphasis (italics) opened by @''@.
+--
+-- The closer is a run of two, three or four apostrophes; the longer runs
+-- are accepted only when no further apostrophe follows, and their one or
+-- two surplus apostrophes are kept as literal text inside the emphasis.
+-- The span is never closed at a point where 'strong' parses.
 emph :: PandocMonad m => MWParser m Inlines
-emph = B.emph <$> inlinesBetween start end
-    where start = sym "''"
-          end   = try $ notFollowedBy' (() <$ strong) >> sym "''"
+emph = B.emph . trimInlines . mconcat <$> try (start >> rest)
+    where start = sym "''" >> (lookAhead (void strong) <|> notFollowedBy (char '\''))
+          rest = do
+            (ins, quots) <- manyUntil inline end
+            pure $ ins ++ [quots]
+          end   = try $ notBold >> end''
+          notBold = notFollowedBy' (void strong)
+          -- Longest closer first; surplus apostrophes become literal text.
+          end'' = try (sym "''''" >> notFollowedBy (char '\'') $> B.str "''")
+                  <|> try (sym "'''" >> notFollowedBy (char '\'') $> B.str "'")
+                  <|> (sym "''" $> mempty)
 
+-- | Parse strong emphasis (bold) opened by @'''@.
+--
+-- A run of four apostrophes closes the span when it is followed by @''@
+-- or by no further apostrophe, leaving one literal apostrophe inside it;
+-- otherwise the span closes on a plain @'''@.
 strong :: PandocMonad m => MWParser m Inlines
-strong = B.strong <$> inlinesBetween start end
-    where start = sym "'''"
-          end   = sym "'''"
+strong = B.strong . trimInlines . mconcat <$> try (start >> rest)
+    where start = sym "'''" >> (lookAhead (void emph) <|> notFollowedBy (char '\''))
+          rest = do
+            (ins, quots) <- manyUntil inline end
+            pure $ ins ++ [quots]
+          end   = try (sym "''''" >> (lookAhead (sym "''") <|> notFollowedBy (char '\'')) $> B.str "'")
+                  <|> (sym "'''" $> mempty)
 
 doubleQuotes :: PandocMonad m => MWParser m Inlines
 doubleQuotes = do
diff --git a/test/Tests/Readers/MediaWiki.hs b/test/Tests/Readers/MediaWiki.hs
new file mode 100644
index 000000000000..f92d38cddae7
--- /dev/null
+++ b/test/Tests/Readers/MediaWiki.hs
@@ -0,0 +1,174 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Tests.Readers.MediaWiki
+   Copyright   : © 2025 Evan Silberman
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  :
+   Stability   : alpha
+   Portability : portable
+
+Tests for the MediaWiki reader.
+-}
+
+module Tests.Readers.MediaWiki (tests) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import Test.Tasty
+import Test.Tasty.HUnit (HasCallStack)
+import Tests.Helpers
+import Text.Pandoc
+import Text.Pandoc.Arbitrary ()
+import Text.Pandoc.Builder
+
+-- | Run the MediaWiki reader with default options via 'purely'.
+mw :: Text -> Pandoc
+mw = purely $ readMediaWiki def
+
+-- | Expected inline for a wikilink whose label equals its target: class
+-- @wikilink@, a URL with spaces replaced by underscores, and the raw
+-- destination as the title.
+wikilink :: Text -> Inlines
+wikilink dest = linkWith ("", ["wikilink"], []) (T.replace " " "_" dest) dest (text dest)
+
+infix 4 =:
+-- | Build a named test case whose input is read with 'mw'.
+(=:) :: (ToString c, HasCallStack)
+     => String -> (Text, c) -> TestTree
+(=:) = test mw
+
+-- | Test cases for the MediaWiki reader.
+tests :: [TestTree]
+tests = [
+  -- The "quotes" tests are adapted from tests for parsoid, MediaWiki's current
+  -- wikitext parser. Cf. https://gerrit.wikimedia.org/r/plugins/gitiles/mediawiki/services/parsoid/+/refs/heads/master/tests/parser/quotes.txt
+  testGroup "quotes"
+    [ testGroup "intraword emphasis"
+      [ "italic" =:
+        "plain''italic''plain" =?>
+        para ("plain" <> emph "italic" <> "plain")
+      , "two italics" =:
+        "plain''italic''plain''italic''plain" =?>
+        para ("plain" <> emph "italic" <> "plain" <> emph "italic" <> "plain")
+      , "bold" =:
+        "plain'''bold'''plain" =?>
+        para ("plain" <> strong "bold" <> "plain")
+      , "two bolds" =:
+        "plain'''bold'''plain'''bold'''plain" =?>
+        para ("plain" <> strong "bold" <> "plain" <> strong "bold" <> "plain")
+      , "bold and italic" =:
+        "plain'''bold'''plain''italic''plain" =?>
+        para ("plain" <> strong "bold" <> "plain" <> emph "italic" <> "plain")
+      , "italic and bold" =:
+        "plain''italic''plain'''bold'''plain" =?>
+        para ("plain" <> emph "italic" <> "plain" <> strong "bold" <> "plain")
+      , "italic with bold-italic" =:
+        "plain''italic'''bold-italic'''italic''plain" =?>
+        para ("plain" <> emph ("italic" <> strong "bold-italic" <> "italic") <> "plain")
+      , "bold with bold-italic" =:
+        "plain'''bold''bold-italic''bold'''plain" =?>
+        para ("plain" <> strong ("bold" <> emph "bold-italic" <> "bold") <> "plain")
+      , "bold-italic then italic" =:
+        "plain'''''bold-italic'''italic''plain" =?>
+        para ("plain" <> emph (strong "bold-italic" <> "italic") <> "plain")
+      , "bold-italic then bold" =:
+        "plain'''''bold-italic''bold'''plain" =?>
+        para ("plain" <> strong (emph "bold-italic" <> "bold") <> "plain")
+      , "italic then bold-italic" =:
+        "plain''italic'''bold-italic'''''plain" =?>
+        para ("plain" <> emph ("italic" <> strong "bold-italic") <> "plain")
+      , "bold then bold-italic" =:
+        "plain'''bold''bold-italic'''''plain" =?>
+        para ("plain" <> strong ("bold" <> emph "bold-italic") <> "plain")
+      ]
+    , testGroup "possessives and italics"
+      [ "simple" =:
+        "In ''Flaming Pie'''s liner notes" =?>
+        para ("In " <> emph "Flaming Pie'" <> "s liner notes")
+      , "linked" =:
+        "obtained by ''[[Lunar Prospector]]'''s gamma-ray spectrometer" =?>
+        para ("obtained by " <> emph ((wikilink "Lunar Prospector") <> "'") <> "s gamma-ray spectrometer")
+      , "with following italics" =:
+        "''Sebastián Covarrubias''' ''Tesoro''" =?>
+        para (emph "Sebastián Covarrubias'" <> " " <> emph "Tesoro")
+      , "with internal link" =:
+        "the ''Vocabolario dell'[[Accademia della Crusca]]'', for Italian" =?>
+        para ("the " <> emph ("Vocabolario dell'" <> wikilink "Accademia della Crusca") <>
+              ", for Italian")
+      , "multiple" =:
+        "'''This year''''s election ''should'' beat '''last year''''s." =?>
+        para (strong "This year'" <> "s election " <> emph "should" <> " beat " <> strong "last year'" <> "s.")
+      ]
+    , testGroup "two-quote openings"
+      [ "2 open 3 close" =:
+        "''foo'''" =?>
+        para (emph "foo'")
+      , "2 open 4 close" =:
+        "''foo''''" =?>
+        para (emph "foo''")
+      -- TODO line ends terminate emphases
+      -- , "2 open 5 close" =:
+      --   "''foo'''''" =?>
+      --   para (emph "foo" <> strong "")
+      ]
+    , testGroup "three-quote openings"
+      [ "3 open 2 close" =:
+        "'''foo''" =?>
+        para ("'" <> emph "foo")
+      , "3 open 3 close" =:
+        "'''foo'''" =?>
+        para (strong "foo")
+      , "3 open 4 close" =:
+        "'''foo''''" =?>
+        para (strong "foo'")
+      -- TODO line ends terminate emphases
+      -- , "3 open 5 close" =:
+      --   "'''foo'''''" =?>
+      --   para (strong "foo" <> emph "" )
+      ]
+    , testGroup "four-quote openings"
+      [ "4 open 2 close" =:
+        "''''foo''" =?>
+        para ("''" <> emph "foo")
+      , "4 open 3 close" =:
+        "''''foo'''" =?>
+        para ("'" <> strong "foo")
+      , "4 open 4 close" =:
+        "''''foo''''" =?>
+        para ("'" <> strong "foo'")
+      -- TODO line ends terminate emphases
+      -- , "4 open 5 close" =:
+      --   "''''foo'''''" =?>
+      --   para ("'" <> strong "foo" <> emph "")
+      ]
+    , testGroup "five-quote openings"
+      [ -- TODO line ends terminate emphases
+      --   "5 open 2 close" =:
+      --   "'''''foo''" =?>
+      --   para (strong (emph "foo"))
+      -- , "5 open 3 close" =:
+      --   "'''''foo'''" =?>
+      --   para (emph (strong "foo"))
+      -- , "5 open 4 close" =:
+      --   "'''''foo''''" =?>
+      --   para (emph (strong "foo'"))
+        "5 open 5 close" =:
+        "'''''foo'''''" =?>
+        para (emph (strong "foo"))
+      , "5 open 6 close" =:
+        "'''''foo''''''" =?>
+        para (emph (strong "foo'"))
+      ]
+    , testGroup "multiple quote sequences"
+      [ "2, 4, 2" =:
+        "''foo''''bar''" =?>
+        para (emph ("foo'" <> strong "bar"))
+      , "2, 4, 2, more" =:
+        "''foo''''bar'' something else" =?>
+        -- NOTE(review): the expected value omits the trailing
+        -- " something else" present in the input -- confirm this is intended.
+        para (emph ("foo'" <> strong "bar"))
+      ]
+    ]
+  ]
diff --git a/test/mediawiki-reader.native b/test/mediawiki-reader.native
index 7532528b1b86..3c4527ca6f44 100644
---
a/test/mediawiki-reader.native +++ b/test/mediawiki-reader.native @@ -46,8 +46,8 @@ Pandoc , Para [ Emph [ Str "emph" ] , Space , Strong [ Str "strong" ] ] , Para - [ Strong - [ Emph + [ Emph + [ Strong [ Str "strong" , Space , Str "and" , Space , Str "emph" ] ] ] @@ -766,7 +766,7 @@ Pandoc , LineBreak , Emph [ Code ( "" , [] , [] ) "markups" ] , Code ( "" , [] , [] ) "\160" - , Strong [ Emph [ Code ( "" , [] , [] ) "can" ] ] + , Emph [ Strong [ Code ( "" , [] , [] ) "can" ] ] , Code ( "" , [] , [] ) "\160be\160done." ] , Para diff --git a/test/test-pandoc.hs b/test/test-pandoc.hs index d310b932f316..ae28c0dccf13 100644 --- a/test/test-pandoc.hs +++ b/test/test-pandoc.hs @@ -30,6 +30,7 @@ import qualified Tests.Readers.RTF import qualified Tests.Readers.Txt2Tags import qualified Tests.Readers.Man import qualified Tests.Readers.Mdoc +import qualified Tests.Readers.MediaWiki import qualified Tests.Readers.Pod import qualified Tests.Shared import qualified Tests.Writers.AsciiDoc @@ -103,6 +104,7 @@ tests pandocPath = testGroup "pandoc tests" , testGroup "FB2" Tests.Readers.FB2.tests , testGroup "DokuWiki" Tests.Readers.DokuWiki.tests , testGroup "Pod" Tests.Readers.Pod.tests + , testGroup "MediaWiki" Tests.Readers.MediaWiki.tests ] ]