From 6abda056eb7b14b4f5da2227fbe89237cc832bc9 Mon Sep 17 00:00:00 2001 From: Yuri Romanowski Date: Wed, 14 Dec 2022 17:33:52 +0500 Subject: [PATCH 1/8] [#64] Implement copy/paste protection checks Problem: Currently xrefcheck is not able to detect possibly bad copy-pastes, when some links are referring the same file, but from the link name it seems that one of that links should refer other file. Solution: Implement check, add support for related annotations for `.md` files, add corresponding settings to the config. --- src/Xrefcheck/Scanners/Markdown.hs | 256 ++++++++++++++++++++++------- 1 file changed, 198 insertions(+), 58 deletions(-) diff --git a/src/Xrefcheck/Scanners/Markdown.hs b/src/Xrefcheck/Scanners/Markdown.hs index 8a053cbf..0bba7df5 100644 --- a/src/Xrefcheck/Scanners/Markdown.hs +++ b/src/Xrefcheck/Scanners/Markdown.hs @@ -54,6 +54,24 @@ instance Buildable C.Node where build (C.Node _mpos ty mSubs) = nameF (show ty) $ maybe "[]" interpolateBlockListF (nonEmpty mSubs) +data Node a = Node + { _ndPos :: Maybe PosInfo + , _ndType :: NodeType + , _ndInfo :: a + , _ndSubs :: [Node a] + } + +instance Buildable (Node a) where + build (Node _mpos ty _info mSubs) = nameF (show ty) $ + maybe "[]" interpolateBlockListF (nonEmpty mSubs) + +-- Here and below CPC stands for "copy/paste check" +type NodeCPC = Node CopyPasteCheck + +newtype CopyPasteCheck = CopyPasteCheck + { shouldCheck :: Bool + } deriving stock (Show, Eq, Generic) + toPosition :: Maybe PosInfo -> Position toPosition = Position . \case Nothing -> Nothing @@ -68,7 +86,7 @@ toPosition = Position . \case |] -- | Extract text from the topmost node. -nodeExtractText :: (C.Node) -> Text +nodeExtractText :: Node info -> Text nodeExtractText = T.strip . mconcat . map extractText . nodeFlatten where extractText = \case @@ -76,8 +94,8 @@ nodeExtractText = T.strip . mconcat . map extractText . nodeFlatten CODE t -> t _ -> "" - nodeFlatten :: (C.Node) -> [NodeType] - nodeFlatten (C.Node _pos ty subs) = ty : concatMap nodeFlatten subs + nodeFlatten :: Node info -> [NodeType] + nodeFlatten (Node _pos ty _info subs) = ty : concatMap nodeFlatten subs data IgnoreMode @@ -120,6 +138,7 @@ makeLensesFor [("_ignoreMode", "ignoreMode")] 'Ignore data GetAnnotation = IgnoreAnnotation IgnoreMode + | IgnoreCopyPasteCheck IgnoreMode | InvalidAnnotation Text deriving stock (Eq) @@ -127,6 +146,8 @@ data GetAnnotation data ScannerState = ScannerState { _ssIgnore :: Maybe Ignore + , _ssIgnoreCopyPasteCheck :: Maybe Ignore + , _ssParagraphExpectedAfterCpcAnnotation :: Bool , _ssParentNodeType :: Maybe NodeType -- ^ @cataNodeWithParentNodeInfo@ allows to get a @NodeType@ of parent node from this field } @@ -135,7 +156,9 @@ makeLenses ''ScannerState initialScannerState :: ScannerState initialScannerState = ScannerState { _ssIgnore = Nothing + , _ssIgnoreCopyPasteCheck = Nothing , _ssParentNodeType = Nothing + , _ssParagraphExpectedAfterCpcAnnotation = False } type ScannerM a = StateT ScannerState (Writer [ScanError]) a @@ -155,40 +178,54 @@ cataNodeWithParentNodeInfo f node = cataNode f' node map (ssParentNodeType .= Just ty >>) childScanners -- | Find ignore annotations (ignore paragraph and ignore link) --- and remove nodes that should be ignored. -processAnnotations :: FilePath -> C.Node -> Writer [ScanError] C.Node -processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process +-- and remove nodes that should be ignored; +-- find copy/paste check annotations (ignore for paragraph and for link) +-- and label nodes with a boolean meaning whether they should be +-- copy/paste checked. +processAnnotations :: Bool -> FilePath -> C.Node -> Writer [ScanError] NodeCPC +processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParentNodeInfo process where + withGlobalCPC :: Bool -> CopyPasteCheck + withGlobalCPC localEnabled = CopyPasteCheck $ globalCpcCheckEnabled && localEnabled + process :: Maybe PosInfo -> NodeType - -> [ScannerM C.Node] - -> ScannerM C.Node + -> [ScannerM NodeCPC] + -> ScannerM NodeCPC process pos ty subs = do let node = C.Node pos ty [] - use ssIgnore >>= \ign -> do + use ssIgnore >>= \ign -> + use ssIgnoreCopyPasteCheck >>= \ignCPC -> do -- When no `Ignore` state is set check next node for annotation, -- if found then set it as new `IgnoreMode` otherwise skip node. - let mbAnnotation = getAnnotation node - case mbAnnotation of + case getAnnotation node of Just ann -> handleAnnotation pos ty ann Nothing -> do case ty of - PARAGRAPH -> handleParagraph ign pos ty subs - LINK {} -> handleLink ign pos ty subs - IMAGE {} -> handleLink ign pos ty subs - _ -> handleOther ign pos ty subs + PARAGRAPH -> handleParagraph ign ignCPC pos ty subs + LINK {} -> handleLink ign ignCPC pos ty subs + IMAGE {} -> handleLink ign ignCPC pos ty subs + _ -> handleOther ign ignCPC pos ty subs handleLink :: + Maybe Ignore -> Maybe Ignore -> Maybe PosInfo -> NodeType -> - [ScannerM C.Node] -> - ScannerM C.Node - handleLink ign pos ty subs = do - let traverseChildren = C.Node pos ty <$> sequence subs - -- It can be checked that it's correct for all the cases + [ScannerM NodeCPC] -> + ScannerM NodeCPC + handleLink ign ignCPC pos ty subs = do + let shouldCheckCPC = withGlobalCPC $ isNothing ignCPC + let traverseChildren = Node pos ty shouldCheckCPC <$> sequence subs + -- It's common for all ignore states ssIgnore .= Nothing + -- If there was a copy/paste ignore annotation that expected link, + -- reset this state + resetCpcIgnoreIfLink + -- If right now there was a copy/paste ignore annotation for paragraph, + -- emit an error and reset these states. + reportExpectedParagraphAfterIgnoreCpcAnnotation ty case ign of Nothing -> traverseChildren @@ -199,74 +236,132 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process pure defNode handleParagraph :: + Maybe Ignore -> Maybe Ignore -> Maybe PosInfo -> NodeType -> - [ScannerM C.Node] -> - ScannerM C.Node - handleParagraph ign pos ty subs = do - let traverseChildren = C.Node pos ty <$> sequence subs + [ScannerM NodeCPC] -> + ScannerM NodeCPC + handleParagraph ign ignCPC pos ty subs = do + let shouldCheckCPC = withGlobalCPC $ isNothing ignCPC + let traverseChildren = Node pos ty shouldCheckCPC <$> sequence subs + -- If a new paragraph was expected (this stands for True), now we + -- don't expect paragraphs any more. + ssParagraphExpectedAfterCpcAnnotation .= False node <- case ign of - Nothing -> traverseChildren + Nothing -> + wrapTraverseNodeWithLinkExpectedForCpc traverseChildren Just (Ignore IMSParagraph _) -> do ssIgnore .= Nothing pure defNode Just (Ignore (IMSLink ignoreLinkState) modePos) -> - traverseNodeWithLinkExpected ignoreLinkState modePos pos ty subs + wrapTraverseNodeWithLinkExpected ignoreLinkState modePos $ + wrapTraverseNodeWithLinkExpectedForCpc traverseChildren + + ssIgnoreCopyPasteCheck .= Nothing use ssIgnore >>= \case - Just (Ignore (IMSLink ExpectingLinkInParagraph) pragmaPos) -> + Just (Ignore (IMSLink ExpectingLinkInParagraph) pragmaPos) -> do lift $ tell $ makeError pragmaPos fp LinkErr + ssIgnore .= Nothing + _ -> pass + use ssIgnoreCopyPasteCheck >>= \case + Just (Ignore (IMSLink ExpectingLinkInParagraph) pragmaPos) -> do + lift $ tell $ makeError pragmaPos fp LinkErr -- TODO: different error type + ssIgnoreCopyPasteCheck .= Nothing _ -> pass + pure node handleOther :: + Maybe Ignore -> Maybe Ignore -> Maybe PosInfo -> NodeType -> - [ScannerM C.Node] -> - ScannerM C.Node - handleOther ign pos ty subs = do - let traverseChildren = C.Node pos ty <$> sequence subs + [ScannerM NodeCPC] -> + ScannerM NodeCPC + handleOther ign ignCPC pos ty subs = do + let shouldCheckCPC = withGlobalCPC $ isNothing ignCPC + let traverseChildren = Node pos ty shouldCheckCPC <$> sequence subs + -- If right now there was a copy/paste ignore annotation for paragraph, + -- emit an error and reset these states. + reportExpectedParagraphAfterIgnoreCpcAnnotation ty case ign of - Nothing -> traverseChildren + Nothing -> + wrapTraverseNodeWithLinkExpectedForCpc traverseChildren Just (Ignore IMSParagraph modePos) -> do reportExpectedParagraphAfterIgnoreAnnotation modePos ty ssIgnore .= Nothing - traverseChildren - Just (Ignore (IMSLink ignoreLinkState) modePos) -> do - traverseNodeWithLinkExpected ignoreLinkState modePos pos ty subs + wrapTraverseNodeWithLinkExpectedForCpc traverseChildren + Just (Ignore (IMSLink ignoreLinkState) modePos) -> + wrapTraverseNodeWithLinkExpected ignoreLinkState modePos $ + wrapTraverseNodeWithLinkExpectedForCpc traverseChildren reportExpectedParagraphAfterIgnoreAnnotation :: Maybe PosInfo -> NodeType -> ScannerM () reportExpectedParagraphAfterIgnoreAnnotation modePos ty = lift . tell . makeError modePos fp . ParagraphErr $ prettyType ty - traverseNodeWithLinkExpected :: + resetCpcIgnoreIfLink :: ScannerM () + resetCpcIgnoreIfLink = do + curCpcIgnore <- use ssIgnoreCopyPasteCheck + case _ignoreMode <$> curCpcIgnore of + Just (IMSLink _) -> ssIgnoreCopyPasteCheck .= Nothing + _ -> pass + + reportExpectedParagraphAfterIgnoreCpcAnnotation :: + NodeType -> ScannerM () + reportExpectedParagraphAfterIgnoreCpcAnnotation ty = + use ssIgnoreCopyPasteCheck >>= \case + Just (Ignore IMSParagraph modePos) -> + whenM (use ssParagraphExpectedAfterCpcAnnotation) $ do + reportExpectedParagraphAfterIgnoreAnnotation modePos ty + ssParagraphExpectedAfterCpcAnnotation .= False + ssIgnoreCopyPasteCheck .= Nothing + _ -> pass + + wrapTraverseNodeWithLinkExpected :: IgnoreLinkState -> Maybe PosInfo -> - Maybe PosInfo -> - NodeType -> - [ScannerM C.Node] -> - ScannerM C.Node - traverseNodeWithLinkExpected ignoreLinkState modePos pos ty subs = do - when (ignoreLinkState == ExpectingLinkInSubnodes) $ + ScannerM NodeCPC -> + ScannerM NodeCPC + wrapTraverseNodeWithLinkExpected ignoreLinkState modePos = + if ignoreLinkState /= ExpectingLinkInSubnodes + then id + else \traverse' -> do ssIgnore . _Just . ignoreMode .= IMSLink ParentExpectsLink - node' <- C.Node pos ty <$> sequence subs - when (ignoreLinkState == ExpectingLinkInSubnodes) $ do + node' <- traverse' currentIgnore <- use ssIgnore case currentIgnore of Just (Ignore {_ignoreMode = IMSLink ParentExpectsLink}) -> do lift $ tell $ makeError modePos fp LinkErr ssIgnore .= Nothing _ -> pass - return node' + return node' + + wrapTraverseNodeWithLinkExpectedForCpc :: + ScannerM NodeCPC -> + ScannerM NodeCPC + wrapTraverseNodeWithLinkExpectedForCpc traverse' = do + ignoreCpc <- use ssIgnoreCopyPasteCheck + case ignoreCpc of + Just (Ignore (IMSLink ExpectingLinkInSubnodes) modePos) -> do + ssIgnoreCopyPasteCheck . _Just . ignoreMode .= IMSLink ParentExpectsLink + node' <- traverse' + currentIgnore <- use ssIgnoreCopyPasteCheck + case currentIgnore of + Just (Ignore {_ignoreMode = IMSLink ParentExpectsLink}) -> do + lift $ tell $ makeError modePos fp LinkErr -- TODO: different error type + ssIgnoreCopyPasteCheck .= Nothing + _ -> pass + return node' + _ -> traverse' handleAnnotation :: Maybe PosInfo -> NodeType -> GetAnnotation - -> ScannerM C.Node + -> ScannerM NodeCPC handleAnnotation pos nodeType = \case IgnoreAnnotation mode -> do let reportIfThereWasAnnotation :: ScannerM () @@ -300,6 +395,41 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process whenJust mbIgnoreModeState $ \ignoreModeState -> (ssIgnore .= Just (Ignore ignoreModeState correctPos)) pure defNode + IgnoreCopyPasteCheck mode -> do + mbIgnoreModeState <- case mode of + IMLink -> use ssParentNodeType <&> Just . IMSLink . \case + Just PARAGRAPH -> ExpectingLinkInParagraph + _ -> ExpectingLinkInSubnodes + + IMParagraph -> do + ssParagraphExpectedAfterCpcAnnotation .= True + pure $ Just IMSParagraph + + -- We don't expect to find an `ignore all` annotation here, + -- since that annotation should be at the top of the file and + -- any correct annotations should be handled in `checkGlobalAnnotations` + -- function. + IMAll -> do + lift . tell $ makeError correctPos fp FileErr -- TODO: different error type + pure Nothing + + whenJust mbIgnoreModeState $ \ignoreModeState -> do + let setupNewCpcState = ssIgnoreCopyPasteCheck .= Just (Ignore ignoreModeState correctPos) + use ssIgnoreCopyPasteCheck >>= \case + Nothing -> setupNewCpcState + Just (Ignore curIgn prevPos) + | IMSLink _ <- curIgn -> do + lift $ tell $ makeError prevPos fp LinkErr -- TODO: different error type + setupNewCpcState + | IMSParagraph <- curIgn -> case ignoreModeState of + IMSParagraph -> do + lift . tell . makeError prevPos fp . ParagraphErr $ prettyType nodeType -- TODO: different error type + setupNewCpcState + -- It's OK to have link annotation when paragraph is ignored + -- because in this case all links and all annotations are ignored. + _ -> pass + pure defNode + InvalidAnnotation msg -> do lift . tell $ makeError correctPos fp $ UnrecognisedErr msg pure defNode @@ -312,8 +442,8 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process in fromMaybe "" mType withIgnoreMode - :: ScannerM C.Node - -> Writer [ScanError] C.Node + :: ScannerM (Node info) + -> Writer [ScanError] (Node info) withIgnoreMode action = action `runStateT` initialScannerState >>= \case -- We expect `Ignore` state to be `Nothing` when we reach EOF, -- otherwise that means there was an annotation that didn't match @@ -328,8 +458,8 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process (node, _) -> pure node -- | Custom `foldMap` for source tree. -foldNode :: (Monoid a, Monad m) => (C.Node -> m a) -> C.Node -> m a -foldNode action node@(C.Node _ _ subs) = do +foldNode :: (Monoid a, Monad m) => (Node info -> m a) -> Node info -> m a +foldNode action node@(Node _ _ _ subs) = do a <- action node b <- concatForM subs (foldNode action) return (a <> b) @@ -342,16 +472,17 @@ nodeExtractInfo -> C.Node -> ExtractorM FileInfo nodeExtractInfo fp (C.Node nPos nTy nSubs) = do - let (ignoreFile, contentNodes) = checkGlobalAnnotations nSubs + let (ignoreFile, ignoreCpcInFile, contentNodes) = checkGlobalAnnotations nSubs + cpCheckEnabledGlobally = not ignoreCpcInFile if ignoreFile then return def else diffToFileInfo <$> - (lift (processAnnotations fp $ C.Node nPos nTy contentNodes) + (lift (processAnnotations cpCheckEnabledGlobally fp $ C.Node nPos nTy contentNodes) >>= foldNode extractor) where - extractor :: C.Node -> ExtractorM FileInfoDiff - extractor node@(C.Node pos ty _) = + extractor :: NodeCPC -> ExtractorM FileInfoDiff + extractor node@(Node pos ty _info _) = case ty of HTML_BLOCK _ -> do return mempty @@ -405,11 +536,12 @@ nodeExtractInfo fp (C.Node nPos nTy nSubs) = do DList.empty -- | Check for global annotations, ignoring simple comments if there are any. -checkGlobalAnnotations :: [C.Node] -> (Bool, [C.Node]) +checkGlobalAnnotations :: [C.Node] -> (Bool, Bool, [C.Node]) checkGlobalAnnotations nodes = do let (headerNodes, contentsNodes) = span isHeaderNode nodes ignoreFile = any isIgnoreFile headerNodes - (ignoreFile, contentsNodes) + ignoreCpcInFile = any isIgnoreCpcWithinFile headerNodes + (ignoreFile, ignoreCpcInFile, contentsNodes) where isSimpleComment :: C.Node -> Bool isSimpleComment node = do @@ -420,15 +552,20 @@ checkGlobalAnnotations nodes = do isIgnoreFile :: C.Node -> Bool isIgnoreFile = (Just (IgnoreAnnotation IMAll) ==) . getAnnotation + isIgnoreCpcWithinFile :: C.Node -> Bool + isIgnoreCpcWithinFile = (Just (IgnoreCopyPasteCheck IMAll) ==) . getAnnotation + isHeaderNode :: C.Node -> Bool isHeaderNode node = any ($ node) [ isSimpleComment , isIgnoreFile + , isIgnoreCpcWithinFile ] -defNode :: C.Node -defNode = C.Node Nothing DOCUMENT [] -- hard-coded default Node +-- | Hard-coded default Node +defNode :: NodeCPC +defNode = Node Nothing DOCUMENT (CopyPasteCheck False) [] makeError :: Maybe PosInfo @@ -473,6 +610,8 @@ textToMode :: Text -> GetAnnotation textToMode annText = case wordsList of ("ignore" : [x]) | Just ignMode <- getIgnoreMode x -> IgnoreAnnotation ignMode + ("no" : "duplication" : "check" : "in" : [x]) + | Just ignMode <- getIgnoreMode x -> IgnoreCopyPasteCheck ignMode _ -> InvalidAnnotation annText where wordsList = words annText @@ -482,6 +621,7 @@ getIgnoreMode = \case "link" -> Just IMLink "paragraph" -> Just IMParagraph "all" -> Just IMAll + "file" -> Just IMAll _ -> Nothing parseFileInfo :: MarkdownConfig -> FilePath -> LT.Text -> (FileInfo, [ScanError]) From ea1050beccf38dd69052d4fba8130e8bb596eb80 Mon Sep 17 00:00:00 2001 From: Yuri Romanowski Date: Tue, 13 Dec 2022 13:49:52 +0500 Subject: [PATCH 2/8] [#64] Implement copy-paste check, add golden tests, add new scan errors --- src/Xrefcheck/Command.hs | 5 +- src/Xrefcheck/Core.hs | 12 ++- src/Xrefcheck/Scan.hs | 13 ++- src/Xrefcheck/Scanners/Markdown.hs | 19 ++-- src/Xrefcheck/Verify.hs | 87 +++++++++++++++++-- tests/Test/Xrefcheck/IgnoreRegexSpec.hs | 2 +- tests/Test/Xrefcheck/TooManyRequestsSpec.hs | 6 +- tests/Test/Xrefcheck/UtilRequests.hs | 2 +- .../check-copy-paste/check-copy-paste.bats | 17 ++++ tests/golden/check-copy-paste/expected.gold | 61 +++++++++++++ tests/golden/check-copy-paste/first-file.md | 12 +++ tests/golden/check-copy-paste/log | 61 +++++++++++++ tests/golden/check-copy-paste/second-file.md | 55 ++++++++++++ tests/golden/check-scan-errors/expected.gold | 4 +- 14 files changed, 325 insertions(+), 31 deletions(-) create mode 100644 tests/golden/check-copy-paste/check-copy-paste.bats create mode 100644 tests/golden/check-copy-paste/expected.gold create mode 100644 tests/golden/check-copy-paste/first-file.md create mode 100644 tests/golden/check-copy-paste/log create mode 100644 tests/golden/check-copy-paste/second-file.md diff --git a/src/Xrefcheck/Command.hs b/src/Xrefcheck/Command.hs index 16620a86..12ee272e 100644 --- a/src/Xrefcheck/Command.hs +++ b/src/Xrefcheck/Command.hs @@ -28,7 +28,7 @@ import Xrefcheck.Scan import Xrefcheck.Scanners.Markdown (markdownSupport) import Xrefcheck.System (askWithinCI) import Xrefcheck.Util -import Xrefcheck.Verify (reportVerifyErrs, verifyErrors, verifyRepo) +import Xrefcheck.Verify (reportCopyPasteErrors, reportVerifyErrs, verifyErrors, verifyRepo) readConfig :: FilePath -> IO Config readConfig path = fmap (normaliseConfigFilePaths . overrideConfig) do @@ -81,11 +81,12 @@ defaultAction Options{..} = do whenJust (nonEmpty $ sortBy (compare `on` seFile) scanErrs) $ reportScanErrs - verifyRes <- allowRewrite showProgressBar $ \rw -> do + (verifyRes, copyPasteErrors) <- allowRewrite showProgressBar $ \rw -> do let fullConfig = config { cNetworking = addNetworkingOptions (cNetworking config) oNetworkingOptions } verifyRepo rw fullConfig oMode oRoot repoInfo + whenJust (nonEmpty copyPasteErrors) reportCopyPasteErrors case verifyErrors verifyRes of Nothing | null scanErrs -> fmtLn "All repository links are valid." Nothing -> exitFailure diff --git a/src/Xrefcheck/Core.hs b/src/Xrefcheck/Core.hs index 9c3ae450..6fa7ab14 100644 --- a/src/Xrefcheck/Core.hs +++ b/src/Xrefcheck/Core.hs @@ -12,6 +12,7 @@ module Xrefcheck.Core where import Universum import Control.Lens (makeLenses) +import Control.Lens.Combinators (makeLensesWith) import Data.Aeson (FromJSON (..), withText) import Data.Char (isAlphaNum) import Data.Char qualified as C @@ -70,14 +71,17 @@ instance Given ColorMode => Buildable Position where -- | Full info about a reference. data Reference = Reference - { rName :: Text + { rName :: Text -- ^ Text displayed as reference. - , rLink :: Text + , rLink :: Text -- ^ File or site reference points to. - , rAnchor :: Maybe Text + , rAnchor :: Maybe Text -- ^ Section or custom anchor tag. - , rPos :: Position + , rPos :: Position + -- ^ Whether to check bad copy/paste for this link + , rCheckCopyPaste :: Bool } deriving stock (Show, Generic) +makeLensesWith postfixFields ''Reference -- | Context of anchor. data AnchorType diff --git a/src/Xrefcheck/Scan.hs b/src/Xrefcheck/Scan.hs index da312a0e..91270a14 100644 --- a/src/Xrefcheck/Scan.hs +++ b/src/Xrefcheck/Scan.hs @@ -117,18 +117,27 @@ data ScanErrorDescription = LinkErr | FileErr | ParagraphErr Text + | LinkErrCpc + | FileErrCpc + | ParagraphErrCpc Text | UnrecognisedErr Text deriving stock (Show, Eq) instance Buildable ScanErrorDescription where build = \case LinkErr -> [int||Expected a LINK after "ignore link" annotation|] + LinkErrCpc -> [int||Expected a LINK after "no duplication check in link" annotation|] FileErr -> [int||Annotation "ignore all" must be at the top of \ markdown or right after comments at the top|] + FileErrCpc -> [int||Annotation "no duplication check in file" must be at the top of \ + markdown or right after comments at the top|] ParagraphErr txt -> [int||Expected a PARAGRAPH after \ "ignore paragraph" annotation, but found #{txt}|] - UnrecognisedErr txt -> [int||Unrecognised option "#{txt}" perhaps you meant \ - <"ignore link"|"ignore paragraph"|"ignore all">|] + ParagraphErrCpc txt -> [int||Expected a PARAGRAPH after \ + "no duplication check in paragraph" annotation, but found #{txt}|] + UnrecognisedErr txt -> [int||Unrecognised option "#{txt}", perhaps you meant + <"ignore link"|"ignore paragraph"|"ignore all"> + or "no duplication check in "?|] specificFormatsSupport :: [([Extension], ScanAction)] -> FormatsSupport specificFormatsSupport formats = \ext -> M.lookup ext formatsMap diff --git a/src/Xrefcheck/Scanners/Markdown.hs b/src/Xrefcheck/Scanners/Markdown.hs index 0bba7df5..2ab23a6c 100644 --- a/src/Xrefcheck/Scanners/Markdown.hs +++ b/src/Xrefcheck/Scanners/Markdown.hs @@ -69,7 +69,7 @@ instance Buildable (Node a) where type NodeCPC = Node CopyPasteCheck newtype CopyPasteCheck = CopyPasteCheck - { shouldCheck :: Bool + { cpcShouldCheck :: Bool } deriving stock (Show, Eq, Generic) toPosition :: Maybe PosInfo -> Position @@ -267,7 +267,7 @@ processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParen _ -> pass use ssIgnoreCopyPasteCheck >>= \case Just (Ignore (IMSLink ExpectingLinkInParagraph) pragmaPos) -> do - lift $ tell $ makeError pragmaPos fp LinkErr -- TODO: different error type + lift $ tell $ makeError pragmaPos fp LinkErrCpc ssIgnoreCopyPasteCheck .= Nothing _ -> pass @@ -315,7 +315,7 @@ processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParen use ssIgnoreCopyPasteCheck >>= \case Just (Ignore IMSParagraph modePos) -> whenM (use ssParagraphExpectedAfterCpcAnnotation) $ do - reportExpectedParagraphAfterIgnoreAnnotation modePos ty + lift . tell . makeError modePos fp . ParagraphErrCpc $ prettyType ty ssParagraphExpectedAfterCpcAnnotation .= False ssIgnoreCopyPasteCheck .= Nothing _ -> pass @@ -351,7 +351,7 @@ processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParen currentIgnore <- use ssIgnoreCopyPasteCheck case currentIgnore of Just (Ignore {_ignoreMode = IMSLink ParentExpectsLink}) -> do - lift $ tell $ makeError modePos fp LinkErr -- TODO: different error type + lift $ tell $ makeError modePos fp LinkErrCpc ssIgnoreCopyPasteCheck .= Nothing _ -> pass return node' @@ -410,7 +410,7 @@ processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParen -- any correct annotations should be handled in `checkGlobalAnnotations` -- function. IMAll -> do - lift . tell $ makeError correctPos fp FileErr -- TODO: different error type + lift . tell $ makeError correctPos fp FileErrCpc pure Nothing whenJust mbIgnoreModeState $ \ignoreModeState -> do @@ -419,11 +419,11 @@ processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParen Nothing -> setupNewCpcState Just (Ignore curIgn prevPos) | IMSLink _ <- curIgn -> do - lift $ tell $ makeError prevPos fp LinkErr -- TODO: different error type + lift $ tell $ makeError prevPos fp LinkErrCpc setupNewCpcState | IMSParagraph <- curIgn -> case ignoreModeState of IMSParagraph -> do - lift . tell . makeError prevPos fp . ParagraphErr $ prettyType nodeType -- TODO: different error type + lift . tell . makeError prevPos fp . ParagraphErrCpc $ prettyType nodeType setupNewCpcState -- It's OK to have link annotation when paragraph is ignored -- because in this case all links and all annotations are ignored. @@ -482,7 +482,7 @@ nodeExtractInfo fp (C.Node nPos nTy nSubs) = do where extractor :: NodeCPC -> ExtractorM FileInfoDiff - extractor node@(Node pos ty _info _) = + extractor node@(Node pos ty info _) = case ty of HTML_BLOCK _ -> do return mempty @@ -532,7 +532,8 @@ nodeExtractInfo fp (C.Node nPos nTy nSubs) = do t : ts -> (t, Just $ T.intercalate "#" ts) [] -> error "impossible" return $ FileInfoDiff - (DList.singleton $ Reference {rName, rPos, rLink, rAnchor}) + (DList.singleton $ + Reference {rName, rPos, rLink, rAnchor, rCheckCopyPaste = cpcShouldCheck info}) DList.empty -- | Check for global annotations, ignoring simple comments if there are any. diff --git a/src/Xrefcheck/Verify.hs b/src/Xrefcheck/Verify.hs index 1ed1fdbb..ac8fac8e 100644 --- a/src/Xrefcheck/Verify.hs +++ b/src/Xrefcheck/Verify.hs @@ -28,7 +28,10 @@ module Xrefcheck.Verify -- * URI parsing , parseUri + + -- * Reporting errors , reportVerifyErrs + , reportCopyPasteErrors ) where import Universum @@ -37,9 +40,11 @@ import Control.Concurrent.Async (Async, async, cancel, poll, wait, withAsync) import Control.Exception (AsyncException (..), throwIO) import Control.Monad.Except (MonadError (..)) import Data.ByteString qualified as BS +import Data.Char (isAlphaNum) import Data.List qualified as L import Data.Map qualified as M import Data.Reflection (Given) +import Data.Text qualified as T import Data.Text.Metrics (damerauLevenshteinNorm) import Data.Time (UTCTime, defaultTimeLocale, formatTime, readPTime, rfc822DateFormat) import Data.Time.Clock.POSIX (getPOSIXTime) @@ -255,6 +260,21 @@ instance Given ColorMode => Buildable VerifyError where #{redirectedUrl} |] +data CopyPasteCheckResult = CopyPasteCheckResult + { crFile :: FilePath, + crOriginalRef :: Reference, + crCopiedRef :: Reference + } + +instance (Given ColorMode) => Buildable CopyPasteCheckResult where + build CopyPasteCheckResult {..} = + [int|| + In file #{styleIfNeeded Faint (styleIfNeeded Bold crFile)} + #{crCopiedRef}\ + is possibly a bad copy paste of + #{crOriginalRef} + |] + reportVerifyErrs :: Given ColorMode => NonEmpty (WithReferenceLoc VerifyError) -> IO () reportVerifyErrs errs = fmt @@ -265,6 +285,17 @@ reportVerifyErrs errs = fmt Invalid references dumped, #{length errs} in total. |] +reportCopyPasteErrors + :: Given ColorMode => NonEmpty CopyPasteCheckResult -> IO () +reportCopyPasteErrors errs = fmt + [int|| + === Possible copy/paste errors === + + #{interpolateIndentF 2 (interpolateBlockListF' "➥ " build errs)} + Possible copy/paste errors dumped, #{length errs} in total. + |] + + data RetryAfter = Date UTCTime | Seconds (Time Second) deriving stock (Show, Eq) @@ -355,7 +386,7 @@ verifyRepo -> VerifyMode -> FilePath -> RepoInfo - -> IO (VerifyResult $ WithReferenceLoc VerifyError) + -> IO (VerifyResult $ WithReferenceLoc VerifyError, [CopyPasteCheckResult]) verifyRepo rw config@Config{..} @@ -363,24 +394,29 @@ verifyRepo root repoInfo'@(RepoInfo files _) = do - let toScan = do - (file, fileInfo) <- M.toList files + + let filesToScan = flip mapMaybe (M.toList files) $ \(file, fileInfo) -> do guard . not $ matchesGlobPatterns root (ecIgnoreRefsFrom cExclusions) file case fileInfo of Scanned fi -> do - ref <- _fiReferences fi - return (file, ref) - NotScannable -> empty -- No support for such file, can do nothing. - NotAddedToGit -> empty -- If this file is scannable, we've notified + Just (file, _fiReferences fi) + NotScannable -> Nothing -- No support for such file, can do nothing. + NotAddedToGit -> Nothing -- If this file is scannable, we've notified -- user that we are scanning only files -- added to Git while gathering RepoInfo. + shouldCheckCopyPaste _ = True + toCheckCopyPaste = filter (\(file, _refs) -> shouldCheckCopyPaste file) filesToScan + toScan = concatMap (\(file, refs) -> map (file,) refs) filesToScan + copyPasteErrors = [ res + | (file, refs) <- toCheckCopyPaste, res <- checkCopyPaste file refs] + progressRef <- newIORef $ initVerifyProgress (map snd toScan) accumulated <- loopAsyncUntil (printer progressRef) do forConcurrentlyCaching toScan ifExternalThenCache $ \(file, ref) -> verifyReference config mode progressRef repoInfo' root file ref - case accumulated of + (, copyPasteErrors) <$> case accumulated of Right res -> return $ fold res Left (exception, partialRes) -> do -- The user has hit Ctrl+C; display any verification errors we managed to find and exit. @@ -412,6 +448,41 @@ verifyRepo ExternalLoc -> CacheUnderKey rLink _ -> NoCaching +checkCopyPaste :: FilePath -> [Reference] -> [CopyPasteCheckResult] +checkCopyPaste file refs = do + let groupedRefs = + L.groupBy ((==) `on` rLink) $ + sortBy (compare `on` rLink) $ + filter rCheckCopyPaste refs + concatMap checkGroup groupedRefs + where + checkGroup :: [Reference] -> [CopyPasteCheckResult] + checkGroup refsInGroup = do + let refsInGroup' = flip map refsInGroup $ \ref -> + (ref, (prepareRefName (rName ref), prepareRefLink (rLink ref))) + let mbSubstrRef = fst <$> find (textIsLinkSubstr . snd) refsInGroup' + others = fst <$> filter (not . textIsLinkSubstr . snd) refsInGroup' + maybe [] (\substrRef -> map (CopyPasteCheckResult file substrRef) others) mbSubstrRef + + textIsLinkSubstr :: (Text, Text) -> Bool + textIsLinkSubstr (prepName, prepLink) = prepName `isSubSeq` prepLink + + +prepareRefName :: Text -> Text +prepareRefName = T.toLower . T.filter isAlphaNum + +prepareRefLink :: Text -> Text +prepareRefLink = T.toLower + +isSubSeq :: Text -> Text -> Bool +isSubSeq "" _str = True +isSubSeq _que "" = False +isSubSeq que str + | qhead == shead = isSubSeq qtail stail + | otherwise = isSubSeq que stail + where (qhead, qtail) = T.splitAt 1 que + (shead, stail) = T.splitAt 1 str + shouldCheckLocType :: VerifyMode -> LocationType -> Bool shouldCheckLocType mode locType | isExternal locType = shouldCheckExternal mode diff --git a/tests/Test/Xrefcheck/IgnoreRegexSpec.hs b/tests/Test/Xrefcheck/IgnoreRegexSpec.hs index c3143086..31bd1d53 100644 --- a/tests/Test/Xrefcheck/IgnoreRegexSpec.hs +++ b/tests/Test/Xrefcheck/IgnoreRegexSpec.hs @@ -44,7 +44,7 @@ test_ignoreRegex = give WithoutColors $ verifyRes <- allowRewrite showProgressBar $ \rw -> verifyRepo rw config verifyMode root $ srRepoInfo scanResult - let brokenLinks = pickBrokenLinks verifyRes + let brokenLinks = pickBrokenLinks $ fst verifyRes let matchedLinks = [ "https://bad.referenc/" diff --git a/tests/Test/Xrefcheck/TooManyRequestsSpec.hs b/tests/Test/Xrefcheck/TooManyRequestsSpec.hs index 8b07d490..a61ae50e 100644 --- a/tests/Test/Xrefcheck/TooManyRequestsSpec.hs +++ b/tests/Test/Xrefcheck/TooManyRequestsSpec.hs @@ -63,7 +63,7 @@ test_tooManyRequests = testGroup "429 response tests" } } _ <- verifyReferenceWithProgress - (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing)) + (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing) False) progressRef Progress{..} <- vrExternal <$> readIORef progressRef let ttc = ttTimeToCompletion <$> pTaskTimestamp @@ -88,7 +88,7 @@ test_tooManyRequests = testGroup "429 response tests" } } _ <- verifyReferenceWithProgress - (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing)) + (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing) False) progressRef Progress{..} <- vrExternal <$> readIORef progressRef let ttc = fromMaybe (sec 0) $ ttTimeToCompletion <$> pTaskTimestamp @@ -114,7 +114,7 @@ test_tooManyRequests = testGroup "429 response tests" } } _ <- verifyReferenceWithProgress - (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing)) + (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing) False) progressRef Progress{..} <- vrExternal <$> readIORef progressRef let ttc = ttTimeToCompletion <$> pTaskTimestamp diff --git a/tests/Test/Xrefcheck/UtilRequests.hs b/tests/Test/Xrefcheck/UtilRequests.hs index 8582cd8c..1285a600 100644 --- a/tests/Test/Xrefcheck/UtilRequests.hs +++ b/tests/Test/Xrefcheck/UtilRequests.hs @@ -62,7 +62,7 @@ checkLinkAndProgressWithServer mock link progress vrExpectation = verifyLink :: Text -> IO (VerifyResult VerifyError, Progress Int) verifyLink link = do - let reference = Reference "" link Nothing (Position Nothing) + let reference = Reference "" link Nothing (Position Nothing) False progRef <- newIORef $ initVerifyProgress [reference] result <- verifyReferenceWithProgress reference progRef p <- readIORef progRef diff --git a/tests/golden/check-copy-paste/check-copy-paste.bats b/tests/golden/check-copy-paste/check-copy-paste.bats new file mode 100644 index 00000000..45971b8a --- /dev/null +++ b/tests/golden/check-copy-paste/check-copy-paste.bats @@ -0,0 +1,17 @@ +#!/usr/bin/env bats + +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: MPL-2.0 + +load '../helpers/bats-support/load' +load '../helpers/bats-assert/load' +load '../helpers/bats-file/load' +load '../helpers' + + +@test "Check possible copy-paste errors and copy-paste annotations " { + to_temp xrefcheck + + assert_diff expected.gold +} diff --git a/tests/golden/check-copy-paste/expected.gold b/tests/golden/check-copy-paste/expected.gold new file mode 100644 index 00000000..b085c179 --- /dev/null +++ b/tests/golden/check-copy-paste/expected.gold @@ -0,0 +1,61 @@ +=== Scan errors found === + + ➥ In file second-file.md + scan error at src:12:1-25: + + Unrecognised option "no dh", perhaps you meant + <"ignore link"|"ignore paragraph"|"ignore all"> + or "no duplication check in "? + + ➥ In file second-file.md + scan error at src:15:1-53: + + Expected a PARAGRAPH after "no duplication check in paragraph" annotation, but found HEADING + + ➥ In file second-file.md + scan error at src:20:1-48: + + Expected a LINK after "no duplication check in link" annotation + + ➥ In file second-file.md + scan error at src:25:1-48: + + Annotation "no duplication check in file" must be at the top of markdown or right after comments at the top + +Scan errors dumped, 4 in total. +=== Possible copy/paste errors === + + ➥ In file second-file.md + reference (relative) at src:38:1-28: + - text: "Link 5" + - link: ./first-file.md + - anchor: - + is possibly a bad copy paste of + reference (relative) at src:7:1-34: + - text: "First file" + - link: ./first-file.md + - anchor: - + + ➥ In file second-file.md + reference (relative) at src:39:1-29: + - text: "Lol Kek" + - link: ./first-file.md + - anchor: - + is possibly a bad copy paste of + reference (relative) at src:7:1-34: + - text: "First file" + - link: ./first-file.md + - anchor: - + + ➥ In file second-file.md + reference (external) at src:44:1-28: + - text: "gitlab" + - link: https://github.com + - anchor: - + is possibly a bad copy paste of + reference (external) at src:43:1-28: + - text: "github" + - link: https://github.com + - anchor: - + +Possible copy/paste errors dumped, 3 in total. diff --git a/tests/golden/check-copy-paste/first-file.md b/tests/golden/check-copy-paste/first-file.md new file mode 100644 index 00000000..7e106e99 --- /dev/null +++ b/tests/golden/check-copy-paste/first-file.md @@ -0,0 +1,12 @@ + + + + + +[ Second - ---file- ](./second-file.md) +[ Link 2](./second-file.md) + diff --git a/tests/golden/check-copy-paste/log b/tests/golden/check-copy-paste/log new file mode 100644 index 00000000..b085c179 --- /dev/null +++ b/tests/golden/check-copy-paste/log @@ -0,0 +1,61 @@ +=== Scan errors found === + + ➥ In file second-file.md + scan error at src:12:1-25: + + Unrecognised option "no dh", perhaps you meant + <"ignore link"|"ignore paragraph"|"ignore all"> + or "no duplication check in "? + + ➥ In file second-file.md + scan error at src:15:1-53: + + Expected a PARAGRAPH after "no duplication check in paragraph" annotation, but found HEADING + + ➥ In file second-file.md + scan error at src:20:1-48: + + Expected a LINK after "no duplication check in link" annotation + + ➥ In file second-file.md + scan error at src:25:1-48: + + Annotation "no duplication check in file" must be at the top of markdown or right after comments at the top + +Scan errors dumped, 4 in total. +=== Possible copy/paste errors === + + ➥ In file second-file.md + reference (relative) at src:38:1-28: + - text: "Link 5" + - link: ./first-file.md + - anchor: - + is possibly a bad copy paste of + reference (relative) at src:7:1-34: + - text: "First file" + - link: ./first-file.md + - anchor: - + + ➥ In file second-file.md + reference (relative) at src:39:1-29: + - text: "Lol Kek" + - link: ./first-file.md + - anchor: - + is possibly a bad copy paste of + reference (relative) at src:7:1-34: + - text: "First file" + - link: ./first-file.md + - anchor: - + + ➥ In file second-file.md + reference (external) at src:44:1-28: + - text: "gitlab" + - link: https://github.com + - anchor: - + is possibly a bad copy paste of + reference (external) at src:43:1-28: + - text: "github" + - link: https://github.com + - anchor: - + +Possible copy/paste errors dumped, 3 in total. diff --git a/tests/golden/check-copy-paste/second-file.md b/tests/golden/check-copy-paste/second-file.md new file mode 100644 index 00000000..e7f08450 --- /dev/null +++ b/tests/golden/check-copy-paste/second-file.md @@ -0,0 +1,55 @@ + + +[ First file ](./first-file.md) + + +[ Link 2](./first-file.md) + + + + + + +# asd + + + + +# asd + + + + + + + +[ Link 3](./first-file.md) + + + + +[ Link 4](./first-file.md) + + +[ Link 5](./first-file.md) +[ Lol Kek](./first-file.md) + + + +[github](https://github.com) +[gitlab](https://github.com) + + + +[github](https://github.com) +[gitlab](https://github.com) + + + + +[github](https://github.com) +[gitlab](https://github.com) diff --git a/tests/golden/check-scan-errors/expected.gold b/tests/golden/check-scan-errors/expected.gold index a1933ffe..ca734335 100644 --- a/tests/golden/check-scan-errors/expected.gold +++ b/tests/golden/check-scan-errors/expected.gold @@ -18,7 +18,9 @@ ➥ In file check-scan-errors.md scan error at src:21:1-50: - Unrecognised option "ignore unrecognised-annotation" perhaps you meant <"ignore link"|"ignore paragraph"|"ignore all"> + Unrecognised option "ignore unrecognised-annotation", perhaps you meant + <"ignore link"|"ignore paragraph"|"ignore all"> + or "no duplication check in "? ➥ In file check-second-file.md scan error at src:9:1-29: From 427fb0ae0a832dc7cd737153aa0ec0832f1c2830 Mon Sep 17 00:00:00 2001 From: Yuri Romanowski Date: Thu, 15 Dec 2022 00:26:55 +0500 Subject: [PATCH 3/8] Add copy-paste setting config --- src/Xrefcheck/Config.hs | 5 +++++ src/Xrefcheck/Config/Default.hs | 3 +++ src/Xrefcheck/Core.hs | 13 +++++++------ src/Xrefcheck/Scanners/Markdown.hs | 18 +++++++----------- src/Xrefcheck/Verify.hs | 15 +++++++++------ tests/configs/github-config.yaml | 3 +++ 6 files changed, 34 insertions(+), 23 deletions(-) diff --git a/src/Xrefcheck/Config.hs b/src/Xrefcheck/Config.hs index 1496167b..a0d2ff4b 100644 --- a/src/Xrefcheck/Config.hs +++ b/src/Xrefcheck/Config.hs @@ -73,6 +73,8 @@ data ScannersConfig' f = ScannersConfig , scAnchorSimilarityThreshold :: Field f Double -- ^ On 'anchor not found' error, how much similar anchors should be displayed as -- hint. Number should be between 0 and 1, larger value means stricter filter. + , scCopyPasteCheckEnabled :: Field f Bool + -- ^ Whether copy-paste check is enabled globally. } deriving stock (Generic) makeLensesWith postfixFields ''Config' @@ -94,6 +96,9 @@ overrideConfig config , scAnchorSimilarityThreshold = fromMaybe (scAnchorSimilarityThreshold defScanners) $ scAnchorSimilarityThreshold (cScanners config) + , scCopyPasteCheckEnabled = + fromMaybe (scCopyPasteCheckEnabled defScanners) + $ scCopyPasteCheckEnabled (cScanners config) } } where diff --git a/src/Xrefcheck/Config/Default.hs b/src/Xrefcheck/Config/Default.hs index 72bcd1a2..8550b0bd 100644 --- a/src/Xrefcheck/Config/Default.hs +++ b/src/Xrefcheck/Config/Default.hs @@ -67,6 +67,9 @@ scanners: # # This affects which anchors are generated for headers. flavor: #s{flavor} + + # Whether copy-paste check is enabled globally. + copyPasteCheckEnabled: True |] where ignoreLocalRefsFrom :: NonEmpty Text diff --git a/src/Xrefcheck/Core.hs b/src/Xrefcheck/Core.hs index 6fa7ab14..190734bd 100644 --- a/src/Xrefcheck/Core.hs +++ b/src/Xrefcheck/Core.hs @@ -106,9 +106,9 @@ data FileInfoDiff = FileInfoDiff } makeLenses ''FileInfoDiff -diffToFileInfo :: FileInfoDiff -> FileInfo -diffToFileInfo (FileInfoDiff refs anchors) = - FileInfo (DList.toList refs) (DList.toList anchors) +diffToFileInfo :: Bool -> FileInfoDiff -> FileInfo +diffToFileInfo ignoreCpcInFile (FileInfoDiff refs anchors) = + FileInfo (DList.toList refs) (DList.toList anchors) ignoreCpcInFile instance Semigroup FileInfoDiff where FileInfoDiff a b <> FileInfoDiff c d = FileInfoDiff (a <> c) (b <> d) @@ -118,13 +118,14 @@ instance Monoid FileInfoDiff where -- | All information regarding a single file we care about. data FileInfo = FileInfo - { _fiReferences :: [Reference] - , _fiAnchors :: [Anchor] + { _fiReferences :: [Reference] + , _fiAnchors :: [Anchor] + , _fiCopyPasteCheck :: Bool } deriving stock (Show, Generic) makeLenses ''FileInfo instance Default FileInfo where - def = diffToFileInfo mempty + def = diffToFileInfo True mempty data ScanPolicy = OnlyTracked diff --git a/src/Xrefcheck/Scanners/Markdown.hs b/src/Xrefcheck/Scanners/Markdown.hs index 2ab23a6c..6a439f8c 100644 --- a/src/Xrefcheck/Scanners/Markdown.hs +++ b/src/Xrefcheck/Scanners/Markdown.hs @@ -182,12 +182,9 @@ cataNodeWithParentNodeInfo f node = cataNode f' node -- find copy/paste check annotations (ignore for paragraph and for link) -- and label nodes with a boolean meaning whether they should be -- copy/paste checked. -processAnnotations :: Bool -> FilePath -> C.Node -> Writer [ScanError] NodeCPC -processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParentNodeInfo process +processAnnotations :: FilePath -> C.Node -> Writer [ScanError] NodeCPC +processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process where - withGlobalCPC :: Bool -> CopyPasteCheck - withGlobalCPC localEnabled = CopyPasteCheck $ globalCpcCheckEnabled && localEnabled - process :: Maybe PosInfo -> NodeType @@ -216,7 +213,7 @@ processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParen [ScannerM NodeCPC] -> ScannerM NodeCPC handleLink ign ignCPC pos ty subs = do - let shouldCheckCPC = withGlobalCPC $ isNothing ignCPC + let shouldCheckCPC = CopyPasteCheck $ isNothing ignCPC let traverseChildren = Node pos ty shouldCheckCPC <$> sequence subs -- It's common for all ignore states ssIgnore .= Nothing @@ -243,7 +240,7 @@ processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParen [ScannerM NodeCPC] -> ScannerM NodeCPC handleParagraph ign ignCPC pos ty subs = do - let shouldCheckCPC = withGlobalCPC $ isNothing ignCPC + let shouldCheckCPC = CopyPasteCheck $ isNothing ignCPC let traverseChildren = Node pos ty shouldCheckCPC <$> sequence subs -- If a new paragraph was expected (this stands for True), now we -- don't expect paragraphs any more. @@ -281,7 +278,7 @@ processAnnotations globalCpcCheckEnabled fp = withIgnoreMode . cataNodeWithParen [ScannerM NodeCPC] -> ScannerM NodeCPC handleOther ign ignCPC pos ty subs = do - let shouldCheckCPC = withGlobalCPC $ isNothing ignCPC + let shouldCheckCPC = CopyPasteCheck $ isNothing ignCPC let traverseChildren = Node pos ty shouldCheckCPC <$> sequence subs -- If right now there was a copy/paste ignore annotation for paragraph, -- emit an error and reset these states. @@ -473,11 +470,10 @@ nodeExtractInfo -> ExtractorM FileInfo nodeExtractInfo fp (C.Node nPos nTy nSubs) = do let (ignoreFile, ignoreCpcInFile, contentNodes) = checkGlobalAnnotations nSubs - cpCheckEnabledGlobally = not ignoreCpcInFile if ignoreFile then return def - else diffToFileInfo <$> - (lift (processAnnotations cpCheckEnabledGlobally fp $ C.Node nPos nTy contentNodes) + else diffToFileInfo (not ignoreCpcInFile) <$> + (lift (processAnnotations fp $ C.Node nPos nTy contentNodes) >>= foldNode extractor) where diff --git a/src/Xrefcheck/Verify.hs b/src/Xrefcheck/Verify.hs index ac8fac8e..e0a0e46d 100644 --- a/src/Xrefcheck/Verify.hs +++ b/src/Xrefcheck/Verify.hs @@ -399,17 +399,20 @@ verifyRepo guard . not $ matchesGlobPatterns root (ecIgnoreRefsFrom cExclusions) file case fileInfo of Scanned fi -> do - Just (file, _fiReferences fi) + Just (file, fi) NotScannable -> Nothing -- No support for such file, can do nothing. NotAddedToGit -> Nothing -- If this file is scannable, we've notified -- user that we are scanning only files -- added to Git while gathering RepoInfo. - shouldCheckCopyPaste _ = True - toCheckCopyPaste = filter (\(file, _refs) -> shouldCheckCopyPaste file) filesToScan - toScan = concatMap (\(file, refs) -> map (file,) refs) filesToScan - copyPasteErrors = [ res - | (file, refs) <- toCheckCopyPaste, res <- checkCopyPaste file refs] + toCheckCopyPaste = map (second _fiReferences) $ filter (_fiCopyPasteCheck . snd) filesToScan + toScan = concatMap (\(file, fileInfo) -> map (file,) $ _fiReferences fileInfo) filesToScan + copyPasteErrors = if scCopyPasteCheckEnabled cScanners + then [ res + | (file, refs) <- toCheckCopyPaste, + res <- checkCopyPaste file refs + ] + else [] progressRef <- newIORef $ initVerifyProgress (map snd toScan) diff --git a/tests/configs/github-config.yaml b/tests/configs/github-config.yaml index 35e3a088..c0698528 100644 --- a/tests/configs/github-config.yaml +++ b/tests/configs/github-config.yaml @@ -56,3 +56,6 @@ scanners: # # This affects which anchors are generated for headers. flavor: GitHub + + # Whether copy-paste check is enabled globally. + copyPasteCheckEnabled: True From 0300735a1be2c07e10e1557ac7af262824eed49e Mon Sep 17 00:00:00 2001 From: Yuri Romanowski Date: Thu, 15 Dec 2022 01:19:12 +0500 Subject: [PATCH 4/8] Remove temporary file --- tests/golden/check-copy-paste/log | 61 ------------------------------- 1 file changed, 61 deletions(-) delete mode 100644 tests/golden/check-copy-paste/log diff --git a/tests/golden/check-copy-paste/log b/tests/golden/check-copy-paste/log deleted file mode 100644 index b085c179..00000000 --- a/tests/golden/check-copy-paste/log +++ /dev/null @@ -1,61 +0,0 @@ -=== Scan errors found === - - ➥ In file second-file.md - scan error at src:12:1-25: - - Unrecognised option "no dh", perhaps you meant - <"ignore link"|"ignore paragraph"|"ignore all"> - or "no duplication check in "? - - ➥ In file second-file.md - scan error at src:15:1-53: - - Expected a PARAGRAPH after "no duplication check in paragraph" annotation, but found HEADING - - ➥ In file second-file.md - scan error at src:20:1-48: - - Expected a LINK after "no duplication check in link" annotation - - ➥ In file second-file.md - scan error at src:25:1-48: - - Annotation "no duplication check in file" must be at the top of markdown or right after comments at the top - -Scan errors dumped, 4 in total. -=== Possible copy/paste errors === - - ➥ In file second-file.md - reference (relative) at src:38:1-28: - - text: "Link 5" - - link: ./first-file.md - - anchor: - - is possibly a bad copy paste of - reference (relative) at src:7:1-34: - - text: "First file" - - link: ./first-file.md - - anchor: - - - ➥ In file second-file.md - reference (relative) at src:39:1-29: - - text: "Lol Kek" - - link: ./first-file.md - - anchor: - - is possibly a bad copy paste of - reference (relative) at src:7:1-34: - - text: "First file" - - link: ./first-file.md - - anchor: - - - ➥ In file second-file.md - reference (external) at src:44:1-28: - - text: "gitlab" - - link: https://github.com - - anchor: - - is possibly a bad copy paste of - reference (external) at src:43:1-28: - - text: "github" - - link: https://github.com - - anchor: - - -Possible copy/paste errors dumped, 3 in total. From 12ef8493a0baf0807b6d7121234d11eed1871bf9 Mon Sep 17 00:00:00 2001 From: Yuri Romanowski Date: Thu, 15 Dec 2022 19:16:46 +0500 Subject: [PATCH 5/8] Take anchors into account for copypaste check --- src/Xrefcheck/Verify.hs | 38 +++++++------- tests/golden/check-copy-paste/expected.gold | 37 +++++++++----- tests/golden/check-copy-paste/first-file.md | 8 ++- tests/golden/check-copy-paste/second-file.md | 54 ++++++++++++++------ 4 files changed, 87 insertions(+), 50 deletions(-) diff --git a/src/Xrefcheck/Verify.hs b/src/Xrefcheck/Verify.hs index e0a0e46d..6980b737 100644 --- a/src/Xrefcheck/Verify.hs +++ b/src/Xrefcheck/Verify.hs @@ -453,16 +453,20 @@ verifyRepo checkCopyPaste :: FilePath -> [Reference] -> [CopyPasteCheckResult] checkCopyPaste file refs = do - let groupedRefs = - L.groupBy ((==) `on` rLink) $ - sortBy (compare `on` rLink) $ + let getLinkAndAnchor x = (rLink x, rAnchor x) + groupedRefs = + L.groupBy ((==) `on` getLinkAndAnchor) $ + sortBy (compare `on` getLinkAndAnchor) $ filter rCheckCopyPaste refs concatMap checkGroup groupedRefs where checkGroup :: [Reference] -> [CopyPasteCheckResult] checkGroup refsInGroup = do + let mergeLinkAndAnchor ref = maybe (rLink ref) (rLink ref <>) $ rAnchor ref let refsInGroup' = flip map refsInGroup $ \ref -> - (ref, (prepareRefName (rName ref), prepareRefLink (rLink ref))) + (ref, (prepareNameForCheck $ rName ref, + prepareNameForCheck $ mergeLinkAndAnchor ref)) + -- most of time this will be Nothing and we won't need `others` let mbSubstrRef = fst <$> find (textIsLinkSubstr . snd) refsInGroup' others = fst <$> filter (not . textIsLinkSubstr . snd) refsInGroup' maybe [] (\substrRef -> map (CopyPasteCheckResult file substrRef) others) mbSubstrRef @@ -470,21 +474,17 @@ checkCopyPaste file refs = do textIsLinkSubstr :: (Text, Text) -> Bool textIsLinkSubstr (prepName, prepLink) = prepName `isSubSeq` prepLink - -prepareRefName :: Text -> Text -prepareRefName = T.toLower . T.filter isAlphaNum - -prepareRefLink :: Text -> Text -prepareRefLink = T.toLower - -isSubSeq :: Text -> Text -> Bool -isSubSeq "" _str = True -isSubSeq _que "" = False -isSubSeq que str - | qhead == shead = isSubSeq qtail stail - | otherwise = isSubSeq que stail - where (qhead, qtail) = T.splitAt 1 que - (shead, stail) = T.splitAt 1 str + prepareNameForCheck :: Text -> Text + prepareNameForCheck = T.toLower . T.filter isAlphaNum + + isSubSeq :: Text -> Text -> Bool + isSubSeq "" _str = True + isSubSeq _que "" = False + isSubSeq que str + | qhead == shead = isSubSeq qtail stail + | otherwise = isSubSeq que stail + where (qhead, qtail) = T.splitAt 1 que + (shead, stail) = T.splitAt 1 str shouldCheckLocType :: VerifyMode -> LocationType -> Bool shouldCheckLocType mode locType diff --git a/tests/golden/check-copy-paste/expected.gold b/tests/golden/check-copy-paste/expected.gold index b085c179..1077dd7d 100644 --- a/tests/golden/check-copy-paste/expected.gold +++ b/tests/golden/check-copy-paste/expected.gold @@ -1,24 +1,24 @@ === Scan errors found === ➥ In file second-file.md - scan error at src:12:1-25: + scan error at src:35:1-25: Unrecognised option "no dh", perhaps you meant <"ignore link"|"ignore paragraph"|"ignore all"> or "no duplication check in "? ➥ In file second-file.md - scan error at src:15:1-53: + scan error at src:40:1-53: Expected a PARAGRAPH after "no duplication check in paragraph" annotation, but found HEADING ➥ In file second-file.md - scan error at src:20:1-48: + scan error at src:46:1-48: Expected a LINK after "no duplication check in link" annotation ➥ In file second-file.md - scan error at src:25:1-48: + scan error at src:51:1-48: Annotation "no duplication check in file" must be at the top of markdown or right after comments at the top @@ -26,36 +26,47 @@ Scan errors dumped, 4 in total. === Possible copy/paste errors === ➥ In file second-file.md - reference (relative) at src:38:1-28: - - text: "Link 5" + reference (relative) at src:20:1-29: + - text: "Lol Kek" - link: ./first-file.md - anchor: - is possibly a bad copy paste of - reference (relative) at src:7:1-34: + reference (relative) at src:10:1-34: - text: "First file" - link: ./first-file.md - anchor: - ➥ In file second-file.md - reference (relative) at src:39:1-29: - - text: "Lol Kek" + reference (relative) at src:21:1-30: + - text: "Baz quux" - link: ./first-file.md - anchor: - is possibly a bad copy paste of - reference (relative) at src:7:1-34: + reference (relative) at src:10:1-34: - text: "First file" - link: ./first-file.md - anchor: - ➥ In file second-file.md - reference (external) at src:44:1-28: + reference (relative) at src:31:1-29: + - text: "fdw" + - link: ./first-file.md + - anchor: chor + is possibly a bad copy paste of + reference (relative) at src:30:1-32: + - text: "ff-cho" + - link: ./first-file.md + - anchor: chor + + ➥ In file second-file.md + reference (external) at src:70:1-28: - text: "gitlab" - link: https://github.com - anchor: - is possibly a bad copy paste of - reference (external) at src:43:1-28: + reference (external) at src:69:1-28: - text: "github" - link: https://github.com - anchor: - -Possible copy/paste errors dumped, 3 in total. +Possible copy/paste errors dumped, 4 in total. diff --git a/tests/golden/check-copy-paste/first-file.md b/tests/golden/check-copy-paste/first-file.md index 7e106e99..2b4cf38c 100644 --- a/tests/golden/check-copy-paste/first-file.md +++ b/tests/golden/check-copy-paste/first-file.md @@ -4,9 +4,15 @@ - SPDX-License-Identifier: MPL-2.0 --> - + [ Second - ---file- ](./second-file.md) [ Link 2](./second-file.md) +# heading + +# anch + +# chor diff --git a/tests/golden/check-copy-paste/second-file.md b/tests/golden/check-copy-paste/second-file.md index e7f08450..43297e4a 100644 --- a/tests/golden/check-copy-paste/second-file.md +++ b/tests/golden/check-copy-paste/second-file.md @@ -4,19 +4,45 @@ - SPDX-License-Identifier: MPL-2.0 --> + + [ First file ](./first-file.md) + [ Link 2](./first-file.md) + +[ Link 3](./first-file.md#heading) + + +[ Lol Kek](./first-file.md) +[ Baz quux](./first-file.md) + + + +[ asd](./first-file.md#anch) +[ fdw](./first-file.md#anch) + + + +[ ff-cho](./first-file.md#chor) +[ fdw](./first-file.md#chor) + + + - + + + # asd - + + # asd @@ -24,32 +50,26 @@ - + [ Link 3](./first-file.md) + + + +hello, how are you, bye - - -[ Link 4](./first-file.md) - - -[ Link 5](./first-file.md) -[ Lol Kek](./first-file.md) - - - + + [github](https://github.com) [gitlab](https://github.com) - - + [github](https://github.com) [gitlab](https://github.com) - + - [github](https://github.com) [gitlab](https://github.com) From f96bc8e6ef42f2c229b73dc8cf5a8b891aa72999 Mon Sep 17 00:00:00 2001 From: Yuri Romanowski Date: Thu, 15 Dec 2022 20:26:27 +0500 Subject: [PATCH 6/8] Add unit tests for no duplication check pragmas --- src/Xrefcheck/Core.hs | 2 +- tests/Test/Xrefcheck/IgnoreAnnotationsSpec.hs | 83 ++++++++++++++----- .../with-annotations/ignore_file_cpc.md | 19 +++++ .../with-annotations/ignore_link_cpc.md | 49 +++++++++++ .../with-annotations/ignore_paragraph_cpc.md | 16 ++++ .../markdowns/with-annotations/no_link_cpc.md | 8 ++ .../with-annotations/no_paragraph_cpc.md | 9 ++ .../unexpected_ignore_file_cpc.md | 11 +++ 8 files changed, 173 insertions(+), 24 deletions(-) create mode 100644 tests/markdowns/with-annotations/ignore_file_cpc.md create mode 100644 tests/markdowns/with-annotations/ignore_link_cpc.md create mode 100644 tests/markdowns/with-annotations/ignore_paragraph_cpc.md create mode 100644 tests/markdowns/with-annotations/no_link_cpc.md create mode 100644 tests/markdowns/with-annotations/no_paragraph_cpc.md create mode 100644 tests/markdowns/with-annotations/unexpected_ignore_file_cpc.md diff --git a/src/Xrefcheck/Core.hs b/src/Xrefcheck/Core.hs index 190734bd..18e02114 100644 --- a/src/Xrefcheck/Core.hs +++ b/src/Xrefcheck/Core.hs @@ -78,8 +78,8 @@ data Reference = Reference , rAnchor :: Maybe Text -- ^ Section or custom anchor tag. , rPos :: Position - -- ^ Whether to check bad copy/paste for this link , rCheckCopyPaste :: Bool + -- ^ Whether to check bad copy/paste for this link } deriving stock (Show, Generic) makeLensesWith postfixFields ''Reference diff --git a/tests/Test/Xrefcheck/IgnoreAnnotationsSpec.hs b/tests/Test/Xrefcheck/IgnoreAnnotationsSpec.hs index c905bb5c..a2cb14a6 100644 --- a/tests/Test/Xrefcheck/IgnoreAnnotationsSpec.hs +++ b/tests/Test/Xrefcheck/IgnoreAnnotationsSpec.hs @@ -18,7 +18,8 @@ import Xrefcheck.Scanners.Markdown test_ignoreAnnotations :: [TestTree] test_ignoreAnnotations = - [ testGroup "Parsing failures" + [ testGroup "Parsing failures" $ + [ testGroup "Ignore annotations" [ testCase "Check if broken link annotation produce error" do let file = "tests/markdowns/with-annotations/no_link.md" errs <- getErrs file @@ -31,35 +32,71 @@ test_ignoreAnnotations = let file = "tests/markdowns/with-annotations/unexpected_ignore_file.md" errs <- getErrs file errs @?= makeError (Just $ PosInfo 9 1 9 29) file FileErr - , testCase "Check if broken unrecognised annotation produce error" do + ] + , testGroup "Ignore copypaste check annotations" + [ testCase "Check if broken copypaste link annotation produce error" do + let file = "tests/markdowns/with-annotations/no_link_cpc.md" + errs <- getErrs file + errs @?= makeError (Just $ PosInfo 7 1 7 48) file LinkErrCpc + , testCase "Check if broken copypaste paragraph annotation produce error" do + let file = "tests/markdowns/with-annotations/no_paragraph_cpc.md" + errs <- getErrs file + errs @?= makeError (Just $ PosInfo 7 1 7 53) file (ParagraphErrCpc "HEADING") + , testCase "Check if broken copypaste ignore file annotation produce error" do + let file = "tests/markdowns/with-annotations/unexpected_ignore_file_cpc.md" + errs <- getErrs file + errs @?= makeError (Just $ PosInfo 9 1 9 47) file FileErrCpc + ] + , testCase "Check if broken unrecognised annotation produce error" do let file = "tests/markdowns/with-annotations/unrecognised_option.md" errs <- getErrs file errs @?= makeError (Just $ PosInfo 7 1 7 46) file (UnrecognisedErr "ignore unrecognised-option") - ] - , testGroup "\"ignore link\" mode" - [ testCase "Check \"ignore link\" performance" $ do - let file = "tests/markdowns/with-annotations/ignore_link.md" - (fi, errs) <- parse GitHub file - getRefs fi @?= - ["team", "team", "team", "hire-us", "how-we-work", "privacy", "link2", "link2", "link3"] - errs @?= makeError (Just $ PosInfo 42 1 42 31) file LinkErr - ] - , testGroup "\"ignore paragraph\" mode" - [ testCase "Check \"ignore paragraph\" performance" $ do - (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_paragraph.md" - getRefs fi @?= ["blog", "contacts"] - errs @?= [] - ] - , testGroup "\"ignore all\" mode" - [ testCase "Check \"ignore all\" performance" $ do - (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_file.md" - getRefs fi @?= [] - errs @?= [] - ] + ] + , testGroup "Check ignore pragmas" $ + [ testGroup "\"ignore link\" mode" + [ testCase "Check \"ignore link\" performance" $ do + let file = "tests/markdowns/with-annotations/ignore_link.md" + (fi, errs) <- parse GitHub file + getRefs fi @?= + ["team", "team", "team", "hire-us", "how-we-work", "privacy", "link2", "link2", "link3"] + errs @?= makeError (Just $ PosInfo 42 1 42 31) file LinkErr + ] + , testGroup "\"ignore paragraph\" mode" + [ testCase "Check \"ignore paragraph\" performance" $ do + (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_paragraph.md" + getRefs fi @?= ["blog", "contacts"] + errs @?= [] + ] + , testGroup "\"ignore all\" mode" + [ testCase "Check \"ignore all\" performance" $ do + (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_file.md" + getRefs fi @?= [] + errs @?= [] + ] + ] + , testGroup "Check ignore copypaste check pragmas" $ + [ testCase "Check ignore duplication check for link pragmas" $ do + let file = "tests/markdowns/with-annotations/ignore_link_cpc.md" + (fi, errs) <- parse GitHub file + getRefsWithCpc fi @?= + ["team", "team", "team", "hire-us", "how-we-work", "privacy", "link2", "link2", "link3"] + errs @?= makeError (Just $ PosInfo 42 1 42 48) file LinkErrCpc + , testCase "Check ignore copypaste check for paragraph pragmas" $ do + (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_paragraph_cpc.md" + getRefsWithCpc fi @?= ["blog", "contacts"] + errs @?= [] + , testCase "Check ignore copypaste check in file performance" $ do + (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_file_cpc.md" + fi ^. fiCopyPasteCheck @?= False + errs @?= [] + ] ] where getRefs :: FileInfo -> [Text] getRefs fi = map rName $ fi ^. fiReferences + getRefsWithCpc :: FileInfo -> [Text] + getRefsWithCpc fi = map rName $ filter rCheckCopyPaste $ fi ^. fiReferences + getErrs :: FilePath -> IO [ScanError] getErrs path = snd <$> parse GitHub path diff --git a/tests/markdowns/with-annotations/ignore_file_cpc.md b/tests/markdowns/with-annotations/ignore_file_cpc.md new file mode 100644 index 00000000..691eadac --- /dev/null +++ b/tests/markdowns/with-annotations/ignore_file_cpc.md @@ -0,0 +1,19 @@ + + + + + + + +Serokell [web-site](https://serokell.io/) +Serokell [team](https://serokell.io/team) + +Serokell [blog](https://serokell.io/blog) + +Serokell [labs](https://serokell.io/labs) + +Serokell [contacts](https://serokell.io/contacts) diff --git a/tests/markdowns/with-annotations/ignore_link_cpc.md b/tests/markdowns/with-annotations/ignore_link_cpc.md new file mode 100644 index 00000000..b4e6e231 --- /dev/null +++ b/tests/markdowns/with-annotations/ignore_link_cpc.md @@ -0,0 +1,49 @@ + + +### Do not check the first link in the paragraph + + +Serokell [web-site](https://serokell.io/) +Serokell [team](https://serokell.io/team) + + + +Serokell [blog](https://serokell.io/blog) + +Serokell [labs](https://serokell.io/labs) + +Serokell +[contacts](https://serokell.io/contacts) and again +[team](https://serokell.io/team) + +### Do not check not the first link in the paragraph + +[team](https://serokell.io/team) again and [projects](https://serokell.io/projects) + +Also [hire-us](https://serokell.io/hire-us) and +[fintech](https://serokell.io/fintech-development) +development + +Here are [how-we-work](https://serokell.io/how-we-work) and [privacy](https://serokell.io/privacy) +and [ml consulting](https://serokell.io/machine-learning-consulting) + + +Do not check link bug _regression test_ [link1](link1) [link2](link2) + + +Another no duplication check in link bug _some [link1](link1) emphasis_ [link2](link2) + +### Do not check pragma should be followed by + + + +This annotation expects link in paragraph right after it. + +So [link3](link3) is not checked for copypaste. + +Annotation inside paragraph allows +softbreaks and __other *things*__ in paragraph, so [link4](link4) is checked for copypaste. diff --git a/tests/markdowns/with-annotations/ignore_paragraph_cpc.md b/tests/markdowns/with-annotations/ignore_paragraph_cpc.md new file mode 100644 index 00000000..633cc96e --- /dev/null +++ b/tests/markdowns/with-annotations/ignore_paragraph_cpc.md @@ -0,0 +1,16 @@ + + + +Serokell [web-site](https://serokell.io/) +Serokell [team](https://serokell.io/team) + +Serokell [blog](https://serokell.io/blog) + + +Serokell [labs](https://serokell.io/labs) + +Serokell [contacts](https://serokell.io/contacts) diff --git a/tests/markdowns/with-annotations/no_link_cpc.md b/tests/markdowns/with-annotations/no_link_cpc.md new file mode 100644 index 00000000..e1671e52 --- /dev/null +++ b/tests/markdowns/with-annotations/no_link_cpc.md @@ -0,0 +1,8 @@ + + + +not a link diff --git a/tests/markdowns/with-annotations/no_paragraph_cpc.md b/tests/markdowns/with-annotations/no_paragraph_cpc.md new file mode 100644 index 00000000..03967d47 --- /dev/null +++ b/tests/markdowns/with-annotations/no_paragraph_cpc.md @@ -0,0 +1,9 @@ + + + + +# not a paragraph diff --git a/tests/markdowns/with-annotations/unexpected_ignore_file_cpc.md b/tests/markdowns/with-annotations/unexpected_ignore_file_cpc.md new file mode 100644 index 00000000..70981ebe --- /dev/null +++ b/tests/markdowns/with-annotations/unexpected_ignore_file_cpc.md @@ -0,0 +1,11 @@ + + +the first paragraph + + + +the second paragraph From 167ec4b915b39c7f1c7aaada40c2aaa977020d0e Mon Sep 17 00:00:00 2001 From: Yuri Romanowski Date: Thu, 15 Dec 2022 20:43:35 +0500 Subject: [PATCH 7/8] CHANGES.md, README.md --- CHANGES.md | 4 +++- README.md | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index a4d037eb..5988548b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -36,11 +36,13 @@ Unreleased + Now we call references to anchors in current file (e.g. `[a](#b)`) as `file-local` references instead of calling them `current file` (which was ambiguous). * [#233](https://github.com/serokell/xrefcheck/pull/233) - + Now xrefxcheck does not follow redirect links by default. It fails for permanent + + Now xrefcheck does not follow redirect links by default. It fails for permanent redirect responses (i.e. 301 and 308) and passes for temporary ones (i.e. 302, 303, 307). * [#231](https://github.com/serokell/xrefcheck/pull/231) + Anchor analysis takes now into account the appropriate case-sensitivity depending on the configured Markdown flavour. +* [240](https://github.com/serokell/xrefcheck/pull/240) + + Now xrefcheck is able to detect possible copy-pastes relying on links and their names. 0.2.2 ========== diff --git a/README.md b/README.md index 73d76c56..9bfd5c35 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ Comparing to alternative solutions, this tool tries to achieve the following poi * Supports external links (`http`, `https`, `ftp` and `ftps`). * Detects broken and ambiguous anchors in local links. * Integration with GitHub Actions. +* Detects possible bad copy-pastes of links. ## Dependencies [↑](#xrefcheck) @@ -148,6 +149,21 @@ There are several ways to fix this: * By default, `xrefcheck` will ignore links to localhost. * This behavior can be disabled by removing the corresponding entry from the `ignoreExternalRefsTo` list in the config file. +1. How do I disable copy-paste check for specific links? + * Add a `` annotation before the link: + ```md + + Links with bad copypaste: + [good link](https://good.link.uri/). + [copypasted link](https://good.link.uri/). + ``` + ```md + A [good link](https://good.link.uri/) + followed by an [copypasted intentionally](https://good.link.uri/). + ``` + * You can use a `` annotation to disable copy-paste check in a paragraph. + * You can use a `` annotation to disable copy-paste check within an entire file. + ## Further work [↑](#xrefcheck) - [ ] Support link detection in different languages, not only Markdown. From 03453e86d550398b8a068c1e5a60f8b9fad66832 Mon Sep 17 00:00:00 2001 From: Yuri Romanowski Date: Thu, 15 Dec 2022 20:57:36 +0500 Subject: [PATCH 8/8] fixup! [#64] Implement copy/paste protection checks Remove extra parameters in md scanner --- src/Xrefcheck/Scanners/Markdown.hs | 35 ++++++++++-------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/src/Xrefcheck/Scanners/Markdown.hs b/src/Xrefcheck/Scanners/Markdown.hs index 6a439f8c..077b38eb 100644 --- a/src/Xrefcheck/Scanners/Markdown.hs +++ b/src/Xrefcheck/Scanners/Markdown.hs @@ -196,25 +196,23 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process use ssIgnoreCopyPasteCheck >>= \ignCPC -> do -- When no `Ignore` state is set check next node for annotation, -- if found then set it as new `IgnoreMode` otherwise skip node. + let shouldCheckCPC = CopyPasteCheck $ isNothing ignCPC + let traverseChildren = Node pos ty shouldCheckCPC <$> sequence subs case getAnnotation node of Just ann -> handleAnnotation pos ty ann Nothing -> do case ty of - PARAGRAPH -> handleParagraph ign ignCPC pos ty subs - LINK {} -> handleLink ign ignCPC pos ty subs - IMAGE {} -> handleLink ign ignCPC pos ty subs - _ -> handleOther ign ignCPC pos ty subs + PARAGRAPH -> handleParagraph ign traverseChildren + LINK {} -> handleLink ign ty traverseChildren + IMAGE {} -> handleLink ign ty traverseChildren + _ -> handleOther ign ty traverseChildren handleLink :: Maybe Ignore -> - Maybe Ignore -> - Maybe PosInfo -> NodeType -> - [ScannerM NodeCPC] -> + ScannerM NodeCPC -> ScannerM NodeCPC - handleLink ign ignCPC pos ty subs = do - let shouldCheckCPC = CopyPasteCheck $ isNothing ignCPC - let traverseChildren = Node pos ty shouldCheckCPC <$> sequence subs + handleLink ign ty traverseChildren = do -- It's common for all ignore states ssIgnore .= Nothing -- If there was a copy/paste ignore annotation that expected link, @@ -234,14 +232,9 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process handleParagraph :: Maybe Ignore -> - Maybe Ignore -> - Maybe PosInfo -> - NodeType -> - [ScannerM NodeCPC] -> + ScannerM NodeCPC -> ScannerM NodeCPC - handleParagraph ign ignCPC pos ty subs = do - let shouldCheckCPC = CopyPasteCheck $ isNothing ignCPC - let traverseChildren = Node pos ty shouldCheckCPC <$> sequence subs + handleParagraph ign traverseChildren = do -- If a new paragraph was expected (this stands for True), now we -- don't expect paragraphs any more. ssParagraphExpectedAfterCpcAnnotation .= False @@ -272,14 +265,10 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process handleOther :: Maybe Ignore -> - Maybe Ignore -> - Maybe PosInfo -> NodeType -> - [ScannerM NodeCPC] -> + ScannerM NodeCPC -> ScannerM NodeCPC - handleOther ign ignCPC pos ty subs = do - let shouldCheckCPC = CopyPasteCheck $ isNothing ignCPC - let traverseChildren = Node pos ty shouldCheckCPC <$> sequence subs + handleOther ign ty traverseChildren = do -- If right now there was a copy/paste ignore annotation for paragraph, -- emit an error and reset these states. reportExpectedParagraphAfterIgnoreCpcAnnotation ty