diff --git a/CHANGES.md b/CHANGES.md
index a4d037eb..5988548b 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -36,11 +36,13 @@ Unreleased
+ Now we call references to anchors in current file (e.g. `[a](#b)`) as
`file-local` references instead of calling them `current file` (which was ambiguous).
* [#233](https://github.com/serokell/xrefcheck/pull/233)
- + Now xrefxcheck does not follow redirect links by default. It fails for permanent
+ + Now xrefcheck does not follow redirect links by default. It fails for permanent
redirect responses (i.e. 301 and 308) and passes for temporary ones (i.e. 302, 303, 307).
* [#231](https://github.com/serokell/xrefcheck/pull/231)
+ Anchor analysis takes now into account the appropriate case-sensitivity depending on
the configured Markdown flavour.
+* [#240](https://github.com/serokell/xrefcheck/pull/240)
+ + Now xrefcheck is able to detect possible copy-pastes relying on links and their names.
0.2.2
==========
diff --git a/README.md b/README.md
index 73d76c56..9bfd5c35 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,7 @@ Comparing to alternative solutions, this tool tries to achieve the following poi
* Supports external links (`http`, `https`, `ftp` and `ftps`).
* Detects broken and ambiguous anchors in local links.
* Integration with GitHub Actions.
+* Detects possible bad copy-pastes of links.
## Dependencies [↑](#xrefcheck)
@@ -148,6 +149,21 @@ There are several ways to fix this:
* By default, `xrefcheck` will ignore links to localhost.
* This behavior can be disabled by removing the corresponding entry from the `ignoreExternalRefsTo` list in the config file.
+1. How do I disable copy-paste check for specific links?
+ * Add a `<!-- xrefcheck: no duplication check in link -->` annotation before the link:
+ ```md
+ <!-- xrefcheck: no duplication check in link -->
+ Links with bad copypaste:
+ [good link](https://good.link.uri/).
+ [copypasted link](https://good.link.uri/).
+ ```
+ ```md
+ A [good link](https://good.link.uri/)
+ followed by a <!-- xrefcheck: no duplication check in link --> [copypasted intentionally](https://good.link.uri/).
+ ```
+ * You can use a `<!-- xrefcheck: no duplication check in paragraph -->` annotation to disable copy-paste check in a paragraph.
+ * You can use a `<!-- xrefcheck: no duplication check in file -->` annotation to disable copy-paste check within an entire file.
+
## Further work [↑](#xrefcheck)
- [ ] Support link detection in different languages, not only Markdown.
diff --git a/src/Xrefcheck/Command.hs b/src/Xrefcheck/Command.hs
index 16620a86..12ee272e 100644
--- a/src/Xrefcheck/Command.hs
+++ b/src/Xrefcheck/Command.hs
@@ -28,7 +28,7 @@ import Xrefcheck.Scan
import Xrefcheck.Scanners.Markdown (markdownSupport)
import Xrefcheck.System (askWithinCI)
import Xrefcheck.Util
-import Xrefcheck.Verify (reportVerifyErrs, verifyErrors, verifyRepo)
+import Xrefcheck.Verify (reportCopyPasteErrors, reportVerifyErrs, verifyErrors, verifyRepo)
readConfig :: FilePath -> IO Config
readConfig path = fmap (normaliseConfigFilePaths . overrideConfig) do
@@ -81,11 +81,12 @@ defaultAction Options{..} = do
whenJust (nonEmpty $ sortBy (compare `on` seFile) scanErrs) $ reportScanErrs
- verifyRes <- allowRewrite showProgressBar $ \rw -> do
+ (verifyRes, copyPasteErrors) <- allowRewrite showProgressBar $ \rw -> do
let fullConfig = config
{ cNetworking = addNetworkingOptions (cNetworking config) oNetworkingOptions }
verifyRepo rw fullConfig oMode oRoot repoInfo
+ whenJust (nonEmpty copyPasteErrors) reportCopyPasteErrors
case verifyErrors verifyRes of
Nothing | null scanErrs -> fmtLn "All repository links are valid."
Nothing -> exitFailure
diff --git a/src/Xrefcheck/Config.hs b/src/Xrefcheck/Config.hs
index 1496167b..a0d2ff4b 100644
--- a/src/Xrefcheck/Config.hs
+++ b/src/Xrefcheck/Config.hs
@@ -73,6 +73,8 @@ data ScannersConfig' f = ScannersConfig
, scAnchorSimilarityThreshold :: Field f Double
-- ^ On 'anchor not found' error, how much similar anchors should be displayed as
-- hint. Number should be between 0 and 1, larger value means stricter filter.
+ , scCopyPasteCheckEnabled :: Field f Bool
+ -- ^ Whether copy-paste check is enabled globally.
} deriving stock (Generic)
makeLensesWith postfixFields ''Config'
@@ -94,6 +96,9 @@ overrideConfig config
, scAnchorSimilarityThreshold =
fromMaybe (scAnchorSimilarityThreshold defScanners)
$ scAnchorSimilarityThreshold (cScanners config)
+ , scCopyPasteCheckEnabled =
+ fromMaybe (scCopyPasteCheckEnabled defScanners)
+ $ scCopyPasteCheckEnabled (cScanners config)
}
}
where
diff --git a/src/Xrefcheck/Config/Default.hs b/src/Xrefcheck/Config/Default.hs
index 72bcd1a2..8550b0bd 100644
--- a/src/Xrefcheck/Config/Default.hs
+++ b/src/Xrefcheck/Config/Default.hs
@@ -67,6 +67,9 @@ scanners:
#
# This affects which anchors are generated for headers.
flavor: #s{flavor}
+
+ # Whether copy-paste check is enabled globally.
+ copyPasteCheckEnabled: True
|]
where
ignoreLocalRefsFrom :: NonEmpty Text
diff --git a/src/Xrefcheck/Core.hs b/src/Xrefcheck/Core.hs
index 9c3ae450..18e02114 100644
--- a/src/Xrefcheck/Core.hs
+++ b/src/Xrefcheck/Core.hs
@@ -12,6 +12,7 @@ module Xrefcheck.Core where
import Universum
import Control.Lens (makeLenses)
+import Control.Lens.Combinators (makeLensesWith)
import Data.Aeson (FromJSON (..), withText)
import Data.Char (isAlphaNum)
import Data.Char qualified as C
@@ -70,14 +71,17 @@ instance Given ColorMode => Buildable Position where
-- | Full info about a reference.
data Reference = Reference
- { rName :: Text
+ { rName :: Text
-- ^ Text displayed as reference.
- , rLink :: Text
+ , rLink :: Text
-- ^ File or site reference points to.
- , rAnchor :: Maybe Text
+ , rAnchor :: Maybe Text
-- ^ Section or custom anchor tag.
- , rPos :: Position
+ , rPos :: Position
+ , rCheckCopyPaste :: Bool
+ -- ^ Whether to check bad copy/paste for this link
} deriving stock (Show, Generic)
+makeLensesWith postfixFields ''Reference
-- | Context of anchor.
data AnchorType
@@ -102,9 +106,9 @@ data FileInfoDiff = FileInfoDiff
}
makeLenses ''FileInfoDiff
-diffToFileInfo :: FileInfoDiff -> FileInfo
-diffToFileInfo (FileInfoDiff refs anchors) =
- FileInfo (DList.toList refs) (DList.toList anchors)
+diffToFileInfo :: Bool -> FileInfoDiff -> FileInfo
+diffToFileInfo ignoreCpcInFile (FileInfoDiff refs anchors) =
+ FileInfo (DList.toList refs) (DList.toList anchors) ignoreCpcInFile
instance Semigroup FileInfoDiff where
FileInfoDiff a b <> FileInfoDiff c d = FileInfoDiff (a <> c) (b <> d)
@@ -114,13 +118,14 @@ instance Monoid FileInfoDiff where
-- | All information regarding a single file we care about.
data FileInfo = FileInfo
- { _fiReferences :: [Reference]
- , _fiAnchors :: [Anchor]
+ { _fiReferences :: [Reference]
+ , _fiAnchors :: [Anchor]
+ , _fiCopyPasteCheck :: Bool
} deriving stock (Show, Generic)
makeLenses ''FileInfo
instance Default FileInfo where
- def = diffToFileInfo mempty
+ def = diffToFileInfo True mempty
data ScanPolicy
= OnlyTracked
diff --git a/src/Xrefcheck/Scan.hs b/src/Xrefcheck/Scan.hs
index da312a0e..91270a14 100644
--- a/src/Xrefcheck/Scan.hs
+++ b/src/Xrefcheck/Scan.hs
@@ -117,18 +117,27 @@ data ScanErrorDescription
= LinkErr
| FileErr
| ParagraphErr Text
+ | LinkErrCpc
+ | FileErrCpc
+ | ParagraphErrCpc Text
| UnrecognisedErr Text
deriving stock (Show, Eq)
instance Buildable ScanErrorDescription where -- user-facing message for every annotation scan error
build = \case
LinkErr -> [int||Expected a LINK after "ignore link" annotation|]
+ LinkErrCpc -> [int||Expected a LINK after "no duplication check in link" annotation|]
FileErr -> [int||Annotation "ignore all" must be at the top of \
markdown or right after comments at the top|]
+ FileErrCpc -> [int||Annotation "no duplication check in file" must be at the top of \
+ markdown or right after comments at the top|]
ParagraphErr txt -> [int||Expected a PARAGRAPH after \
"ignore paragraph" annotation, but found #{txt}|]
- UnrecognisedErr txt -> [int||Unrecognised option "#{txt}" perhaps you meant \
- <"ignore link"|"ignore paragraph"|"ignore all">|]
+ ParagraphErrCpc txt -> [int||Expected a PARAGRAPH after \
+ "no duplication check in paragraph" annotation, but found #{txt}|]
+ UnrecognisedErr txt -> [int||Unrecognised option "#{txt}", perhaps you meant
+ <"ignore link"|"ignore paragraph"|"ignore all">
+ or "no duplication check in <link|paragraph|file>"?|]
specificFormatsSupport :: [([Extension], ScanAction)] -> FormatsSupport
specificFormatsSupport formats = \ext -> M.lookup ext formatsMap
diff --git a/src/Xrefcheck/Scanners/Markdown.hs b/src/Xrefcheck/Scanners/Markdown.hs
index 8a053cbf..077b38eb 100644
--- a/src/Xrefcheck/Scanners/Markdown.hs
+++ b/src/Xrefcheck/Scanners/Markdown.hs
@@ -54,6 +54,24 @@ instance Buildable C.Node where
build (C.Node _mpos ty mSubs) = nameF (show ty) $
maybe "[]" interpolateBlockListF (nonEmpty mSubs)
+data Node a = Node -- Tree node mirroring 'C.Node', with an extra payload of type @a@ on every node.
+ { _ndPos :: Maybe PosInfo -- ^ Position of the node, if any.
+ , _ndType :: NodeType -- ^ Type of the node.
+ , _ndInfo :: a -- ^ Payload attached to the node.
+ , _ndSubs :: [Node a] -- ^ Child nodes.
+ }
+
+instance Buildable (Node a) where -- Shows the node type and its children; position and payload are omitted.
+ build (Node _mpos ty _info mSubs) = nameF (show ty) $
+ maybe "[]" interpolateBlockListF (nonEmpty mSubs) -- "[]" is shown when there are no children
+
+-- Here and below CPC stands for "copy/paste check"
+type NodeCPC = Node CopyPasteCheck -- Node labelled with its copy/paste check flag.
+
+newtype CopyPasteCheck = CopyPasteCheck
+ { cpcShouldCheck :: Bool -- ^ Whether the labelled node takes part in the copy/paste check.
+ } deriving stock (Show, Eq, Generic)
+
toPosition :: Maybe PosInfo -> Position
toPosition = Position . \case
Nothing -> Nothing
@@ -68,7 +86,7 @@ toPosition = Position . \case
|]
-- | Extract text from the topmost node.
-nodeExtractText :: (C.Node) -> Text
+nodeExtractText :: Node info -> Text
nodeExtractText = T.strip . mconcat . map extractText . nodeFlatten
where
extractText = \case
@@ -76,8 +94,8 @@ nodeExtractText = T.strip . mconcat . map extractText . nodeFlatten
CODE t -> t
_ -> ""
- nodeFlatten :: (C.Node) -> [NodeType]
- nodeFlatten (C.Node _pos ty subs) = ty : concatMap nodeFlatten subs
+ nodeFlatten :: Node info -> [NodeType]
+ nodeFlatten (Node _pos ty _info subs) = ty : concatMap nodeFlatten subs
data IgnoreMode
@@ -120,6 +138,7 @@ makeLensesFor [("_ignoreMode", "ignoreMode")] 'Ignore
data GetAnnotation
= IgnoreAnnotation IgnoreMode
+ | IgnoreCopyPasteCheck IgnoreMode
| InvalidAnnotation Text
deriving stock (Eq)
@@ -127,6 +146,8 @@ data GetAnnotation
data ScannerState = ScannerState
{ _ssIgnore :: Maybe Ignore
+ , _ssIgnoreCopyPasteCheck :: Maybe Ignore
+ , _ssParagraphExpectedAfterCpcAnnotation :: Bool
, _ssParentNodeType :: Maybe NodeType
-- ^ @cataNodeWithParentNodeInfo@ allows to get a @NodeType@ of parent node from this field
}
@@ -135,7 +156,9 @@ makeLenses ''ScannerState
initialScannerState :: ScannerState
initialScannerState = ScannerState
{ _ssIgnore = Nothing
+ , _ssIgnoreCopyPasteCheck = Nothing
, _ssParentNodeType = Nothing
+ , _ssParagraphExpectedAfterCpcAnnotation = False
}
type ScannerM a = StateT ScannerState (Writer [ScanError]) a
@@ -155,40 +178,49 @@ cataNodeWithParentNodeInfo f node = cataNode f' node
map (ssParentNodeType .= Just ty >>) childScanners
-- | Find ignore annotations (ignore paragraph and ignore link)
--- and remove nodes that should be ignored.
-processAnnotations :: FilePath -> C.Node -> Writer [ScanError] C.Node
+-- and remove nodes that should be ignored;
+-- find copy/paste check annotations (ignore for paragraph and for link)
+-- and label nodes with a boolean meaning whether they should be
+-- copy/paste checked.
+processAnnotations :: FilePath -> C.Node -> Writer [ScanError] NodeCPC
processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process
where
process
:: Maybe PosInfo
-> NodeType
- -> [ScannerM C.Node]
- -> ScannerM C.Node
+ -> [ScannerM NodeCPC]
+ -> ScannerM NodeCPC
process pos ty subs = do
let node = C.Node pos ty []
- use ssIgnore >>= \ign -> do
+ use ssIgnore >>= \ign ->
+ use ssIgnoreCopyPasteCheck >>= \ignCPC -> do
-- When no `Ignore` state is set check next node for annotation,
-- if found then set it as new `IgnoreMode` otherwise skip node.
- let mbAnnotation = getAnnotation node
- case mbAnnotation of
+ let shouldCheckCPC = CopyPasteCheck $ isNothing ignCPC
+ let traverseChildren = Node pos ty shouldCheckCPC <$> sequence subs
+ case getAnnotation node of
Just ann -> handleAnnotation pos ty ann
Nothing -> do
case ty of
- PARAGRAPH -> handleParagraph ign pos ty subs
- LINK {} -> handleLink ign pos ty subs
- IMAGE {} -> handleLink ign pos ty subs
- _ -> handleOther ign pos ty subs
+ PARAGRAPH -> handleParagraph ign traverseChildren
+ LINK {} -> handleLink ign ty traverseChildren
+ IMAGE {} -> handleLink ign ty traverseChildren
+ _ -> handleOther ign ty traverseChildren
handleLink ::
Maybe Ignore ->
- Maybe PosInfo ->
NodeType ->
- [ScannerM C.Node] ->
- ScannerM C.Node
- handleLink ign pos ty subs = do
- let traverseChildren = C.Node pos ty <$> sequence subs
- -- It can be checked that it's correct for all the cases
+ ScannerM NodeCPC ->
+ ScannerM NodeCPC
+ handleLink ign ty traverseChildren = do
+ -- It's common for all ignore states
ssIgnore .= Nothing
+ -- If there was a copy/paste ignore annotation that expected link,
+ -- reset this state
+ resetCpcIgnoreIfLink
+ -- If right now there was a copy/paste ignore annotation for paragraph,
+ -- emit an error and reset these states.
+ reportExpectedParagraphAfterIgnoreCpcAnnotation ty
case ign of
Nothing -> traverseChildren
@@ -200,73 +232,122 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process
handleParagraph ::
Maybe Ignore ->
- Maybe PosInfo ->
- NodeType ->
- [ScannerM C.Node] ->
- ScannerM C.Node
- handleParagraph ign pos ty subs = do
- let traverseChildren = C.Node pos ty <$> sequence subs
+ ScannerM NodeCPC ->
+ ScannerM NodeCPC
+ handleParagraph ign traverseChildren = do
+ -- A paragraph has arrived, so a pending "no duplication check in paragraph"
+ -- annotation is satisfied: stop expecting one.
+ ssParagraphExpectedAfterCpcAnnotation .= False
node <- case ign of
- Nothing -> traverseChildren
+ Nothing ->
+ wrapTraverseNodeWithLinkExpectedForCpc traverseChildren
Just (Ignore IMSParagraph _) -> do
ssIgnore .= Nothing
pure defNode
Just (Ignore (IMSLink ignoreLinkState) modePos) ->
- traverseNodeWithLinkExpected ignoreLinkState modePos pos ty subs
+ wrapTraverseNodeWithLinkExpected ignoreLinkState modePos $
+ wrapTraverseNodeWithLinkExpectedForCpc traverseChildren
+
use ssIgnore >>= \case
- Just (Ignore (IMSLink ExpectingLinkInParagraph) pragmaPos) ->
+ Just (Ignore (IMSLink ExpectingLinkInParagraph) pragmaPos) -> do
lift $ tell $ makeError pragmaPos fp LinkErr
+ ssIgnore .= Nothing
+ _ -> pass
+ use ssIgnoreCopyPasteCheck >>= \case
+ Just (Ignore (IMSLink ExpectingLinkInParagraph) pragmaPos) ->
+ lift $ tell $ makeError pragmaPos fp LinkErrCpc
_ -> pass
+ -- Copy/paste ignore state is cleared at the end of every paragraph, but only
+ -- after the check above; clearing first would hide a dangling annotation.
+ ssIgnoreCopyPasteCheck .= Nothing
pure node
handleOther ::
Maybe Ignore ->
- Maybe PosInfo ->
NodeType ->
- [ScannerM C.Node] ->
- ScannerM C.Node
- handleOther ign pos ty subs = do
- let traverseChildren = C.Node pos ty <$> sequence subs
+ ScannerM NodeCPC ->
+ ScannerM NodeCPC
+ handleOther ign ty traverseChildren = do
+ -- If right now there was a copy/paste ignore annotation for paragraph,
+ -- emit an error and reset these states.
+ reportExpectedParagraphAfterIgnoreCpcAnnotation ty
case ign of
- Nothing -> traverseChildren
+ Nothing ->
+ wrapTraverseNodeWithLinkExpectedForCpc traverseChildren
Just (Ignore IMSParagraph modePos) -> do
reportExpectedParagraphAfterIgnoreAnnotation modePos ty
ssIgnore .= Nothing
- traverseChildren
- Just (Ignore (IMSLink ignoreLinkState) modePos) -> do
- traverseNodeWithLinkExpected ignoreLinkState modePos pos ty subs
+ wrapTraverseNodeWithLinkExpectedForCpc traverseChildren
+ Just (Ignore (IMSLink ignoreLinkState) modePos) ->
+ wrapTraverseNodeWithLinkExpected ignoreLinkState modePos $
+ wrapTraverseNodeWithLinkExpectedForCpc traverseChildren
reportExpectedParagraphAfterIgnoreAnnotation :: Maybe PosInfo -> NodeType -> ScannerM ()
reportExpectedParagraphAfterIgnoreAnnotation modePos ty =
lift . tell . makeError modePos fp . ParagraphErr $ prettyType ty
- traverseNodeWithLinkExpected ::
+ resetCpcIgnoreIfLink :: ScannerM () -- Clears the copy/paste ignore state, but only when it is a link-level one.
+ resetCpcIgnoreIfLink = do
+ curCpcIgnore <- use ssIgnoreCopyPasteCheck
+ case _ignoreMode <$> curCpcIgnore of
+ Just (IMSLink _) -> ssIgnoreCopyPasteCheck .= Nothing
+ _ -> pass -- paragraph-level or absent state is left untouched
+
+ reportExpectedParagraphAfterIgnoreCpcAnnotation ::
+ NodeType -> ScannerM () -- argument: type of the node that showed up where a paragraph was expected
+ reportExpectedParagraphAfterIgnoreCpcAnnotation ty =
+ use ssIgnoreCopyPasteCheck >>= \case
+ Just (Ignore IMSParagraph modePos) ->
+ whenM (use ssParagraphExpectedAfterCpcAnnotation) $ do -- flag still set: `handleParagraph` has not cleared it yet
+ lift . tell . makeError modePos fp . ParagraphErrCpc $ prettyType ty
+ ssParagraphExpectedAfterCpcAnnotation .= False
+ ssIgnoreCopyPasteCheck .= Nothing
+ _ -> pass -- no paragraph-level copy/paste annotation is active
+
+ wrapTraverseNodeWithLinkExpected ::
IgnoreLinkState ->
Maybe PosInfo ->
- Maybe PosInfo ->
- NodeType ->
- [ScannerM C.Node] ->
- ScannerM C.Node
- traverseNodeWithLinkExpected ignoreLinkState modePos pos ty subs = do
- when (ignoreLinkState == ExpectingLinkInSubnodes) $
+ ScannerM NodeCPC ->
+ ScannerM NodeCPC
+ wrapTraverseNodeWithLinkExpected ignoreLinkState modePos =
+ if ignoreLinkState /= ExpectingLinkInSubnodes
+ then id
+ else \traverse' -> do
ssIgnore . _Just . ignoreMode .= IMSLink ParentExpectsLink
- node' <- C.Node pos ty <$> sequence subs
- when (ignoreLinkState == ExpectingLinkInSubnodes) $ do
+ node' <- traverse'
currentIgnore <- use ssIgnore
case currentIgnore of
Just (Ignore {_ignoreMode = IMSLink ParentExpectsLink}) -> do
lift $ tell $ makeError modePos fp LinkErr
ssIgnore .= Nothing
_ -> pass
- return node'
+ return node'
+
+ wrapTraverseNodeWithLinkExpectedForCpc :: -- Runs a traversal; emits LinkErrCpc if a pending link-level CPC annotation is still pending afterwards.
+ ScannerM NodeCPC ->
+ ScannerM NodeCPC
+ wrapTraverseNodeWithLinkExpectedForCpc traverse' = do
+ ignoreCpc <- use ssIgnoreCopyPasteCheck
+ case ignoreCpc of
+ Just (Ignore (IMSLink ExpectingLinkInSubnodes) modePos) -> do
+ ssIgnoreCopyPasteCheck . _Just . ignoreMode .= IMSLink ParentExpectsLink -- mark the expectation as owned by this node while its subtree runs
+ node' <- traverse'
+ currentIgnore <- use ssIgnoreCopyPasteCheck
+ case currentIgnore of
+ Just (Ignore {_ignoreMode = IMSLink ParentExpectsLink}) -> do -- state is still as set above: nothing in the subtree reset it
+ lift $ tell $ makeError modePos fp LinkErrCpc
+ ssIgnoreCopyPasteCheck .= Nothing
+ _ -> pass
+ return node'
+ _ -> traverse' -- nothing to track: run the traversal unchanged
handleAnnotation
:: Maybe PosInfo
-> NodeType
-> GetAnnotation
- -> ScannerM C.Node
+ -> ScannerM NodeCPC
handleAnnotation pos nodeType = \case
IgnoreAnnotation mode -> do
let reportIfThereWasAnnotation :: ScannerM ()
@@ -300,6 +381,41 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process
whenJust mbIgnoreModeState $ \ignoreModeState ->
(ssIgnore .= Just (Ignore ignoreModeState correctPos))
pure defNode
+ IgnoreCopyPasteCheck mode -> do
+ mbIgnoreModeState <- case mode of
+ IMLink -> use ssParentNodeType <&> Just . IMSLink . \case
+ Just PARAGRAPH -> ExpectingLinkInParagraph
+ _ -> ExpectingLinkInSubnodes
+
+ IMParagraph -> do
+ ssParagraphExpectedAfterCpcAnnotation .= True
+ pure $ Just IMSParagraph
+
+ -- We don't expect to find a `no duplication check in file` annotation here,
+ -- since that annotation should be at the top of the file and
+ -- any correct annotations should be handled in `checkGlobalAnnotations`
+ -- function.
+ IMAll -> do
+ lift . tell $ makeError correctPos fp FileErrCpc
+ pure Nothing
+
+ whenJust mbIgnoreModeState $ \ignoreModeState -> do
+ let setupNewCpcState = ssIgnoreCopyPasteCheck .= Just (Ignore ignoreModeState correctPos)
+ use ssIgnoreCopyPasteCheck >>= \case
+ Nothing -> setupNewCpcState
+ Just (Ignore curIgn prevPos)
+ | IMSLink _ <- curIgn -> do
+ lift $ tell $ makeError prevPos fp LinkErrCpc
+ setupNewCpcState
+ | IMSParagraph <- curIgn -> case ignoreModeState of
+ IMSParagraph -> do
+ lift . tell . makeError prevPos fp . ParagraphErrCpc $ prettyType nodeType
+ setupNewCpcState
+ -- It's OK to have link annotation when paragraph is ignored
+ -- because in this case all links and all annotations are ignored.
+ _ -> pass
+ pure defNode
+
InvalidAnnotation msg -> do
lift . tell $ makeError correctPos fp $ UnrecognisedErr msg
pure defNode
@@ -312,8 +428,8 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process
in fromMaybe "" mType
withIgnoreMode
- :: ScannerM C.Node
- -> Writer [ScanError] C.Node
+ :: ScannerM (Node info)
+ -> Writer [ScanError] (Node info)
withIgnoreMode action = action `runStateT` initialScannerState >>= \case
-- We expect `Ignore` state to be `Nothing` when we reach EOF,
-- otherwise that means there was an annotation that didn't match
@@ -328,8 +444,8 @@ processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process
(node, _) -> pure node
-- | Custom `foldMap` for source tree.
-foldNode :: (Monoid a, Monad m) => (C.Node -> m a) -> C.Node -> m a
-foldNode action node@(C.Node _ _ subs) = do
+foldNode :: (Monoid a, Monad m) => (Node info -> m a) -> Node info -> m a
+foldNode action node@(Node _ _ _ subs) = do
a <- action node
b <- concatForM subs (foldNode action)
return (a <> b)
@@ -342,16 +458,16 @@ nodeExtractInfo
-> C.Node
-> ExtractorM FileInfo
nodeExtractInfo fp (C.Node nPos nTy nSubs) = do
- let (ignoreFile, contentNodes) = checkGlobalAnnotations nSubs
+ let (ignoreFile, ignoreCpcInFile, contentNodes) = checkGlobalAnnotations nSubs
if ignoreFile
then return def
- else diffToFileInfo <$>
+ else diffToFileInfo (not ignoreCpcInFile) <$>
(lift (processAnnotations fp $ C.Node nPos nTy contentNodes)
>>= foldNode extractor)
where
- extractor :: C.Node -> ExtractorM FileInfoDiff
- extractor node@(C.Node pos ty _) =
+ extractor :: NodeCPC -> ExtractorM FileInfoDiff
+ extractor node@(Node pos ty info _) =
case ty of
HTML_BLOCK _ -> do
return mempty
@@ -401,15 +517,17 @@ nodeExtractInfo fp (C.Node nPos nTy nSubs) = do
t : ts -> (t, Just $ T.intercalate "#" ts)
[] -> error "impossible"
return $ FileInfoDiff
- (DList.singleton $ Reference {rName, rPos, rLink, rAnchor})
+ (DList.singleton $
+ Reference {rName, rPos, rLink, rAnchor, rCheckCopyPaste = cpcShouldCheck info})
DList.empty
-- | Check for global annotations, ignoring simple comments if there are any.
-checkGlobalAnnotations :: [C.Node] -> (Bool, [C.Node])
+checkGlobalAnnotations :: [C.Node] -> (Bool, Bool, [C.Node])
checkGlobalAnnotations nodes = do
let (headerNodes, contentsNodes) = span isHeaderNode nodes
ignoreFile = any isIgnoreFile headerNodes
- (ignoreFile, contentsNodes)
+ ignoreCpcInFile = any isIgnoreCpcWithinFile headerNodes
+ (ignoreFile, ignoreCpcInFile, contentsNodes)
where
isSimpleComment :: C.Node -> Bool
isSimpleComment node = do
@@ -420,15 +538,20 @@ checkGlobalAnnotations nodes = do
isIgnoreFile :: C.Node -> Bool
isIgnoreFile = (Just (IgnoreAnnotation IMAll) ==) . getAnnotation
+ isIgnoreCpcWithinFile :: C.Node -> Bool -- True when the node's annotation is a file-wide copy/paste check ignore
+ isIgnoreCpcWithinFile = (Just (IgnoreCopyPasteCheck IMAll) ==) . getAnnotation
+
isHeaderNode :: C.Node -> Bool
isHeaderNode node =
any ($ node)
[ isSimpleComment
, isIgnoreFile
+ , isIgnoreCpcWithinFile
]
-defNode :: C.Node
-defNode = C.Node Nothing DOCUMENT [] -- hard-coded default Node
+-- | Hard-coded default Node
+defNode :: NodeCPC
+defNode = Node Nothing DOCUMENT (CopyPasteCheck False) []
makeError
:: Maybe PosInfo
@@ -473,6 +596,8 @@ textToMode :: Text -> GetAnnotation
textToMode annText = case wordsList of
("ignore" : [x])
| Just ignMode <- getIgnoreMode x -> IgnoreAnnotation ignMode
+ ("no" : "duplication" : "check" : "in" : [x])
+ | Just ignMode <- getIgnoreMode x -> IgnoreCopyPasteCheck ignMode
_ -> InvalidAnnotation annText
where
wordsList = words annText
@@ -482,6 +607,7 @@ getIgnoreMode = \case
"link" -> Just IMLink
"paragraph" -> Just IMParagraph
"all" -> Just IMAll
+ "file" -> Just IMAll
_ -> Nothing
parseFileInfo :: MarkdownConfig -> FilePath -> LT.Text -> (FileInfo, [ScanError])
diff --git a/src/Xrefcheck/Verify.hs b/src/Xrefcheck/Verify.hs
index 1ed1fdbb..6980b737 100644
--- a/src/Xrefcheck/Verify.hs
+++ b/src/Xrefcheck/Verify.hs
@@ -28,7 +28,10 @@ module Xrefcheck.Verify
-- * URI parsing
, parseUri
+
+ -- * Reporting errors
, reportVerifyErrs
+ , reportCopyPasteErrors
) where
import Universum
@@ -37,9 +40,11 @@ import Control.Concurrent.Async (Async, async, cancel, poll, wait, withAsync)
import Control.Exception (AsyncException (..), throwIO)
import Control.Monad.Except (MonadError (..))
import Data.ByteString qualified as BS
+import Data.Char (isAlphaNum)
import Data.List qualified as L
import Data.Map qualified as M
import Data.Reflection (Given)
+import Data.Text qualified as T
import Data.Text.Metrics (damerauLevenshteinNorm)
import Data.Time (UTCTime, defaultTimeLocale, formatTime, readPTime, rfc822DateFormat)
import Data.Time.Clock.POSIX (getPOSIXTime)
@@ -255,6 +260,21 @@ instance Given ColorMode => Buildable VerifyError where
#{redirectedUrl}
|]
+data CopyPasteCheckResult = CopyPasteCheckResult
+ { crFile :: FilePath -- ^ File in which both references were found.
+ , crOriginalRef :: Reference -- ^ Reference whose name matches the shared link.
+ , crCopiedRef :: Reference -- ^ Reference suspected to be a bad copy of the original.
+ }
+
+instance (Given ColorMode) => Buildable CopyPasteCheckResult where -- Renders the file, then the suspected copy, then the reference it resembles.
+ build CopyPasteCheckResult {..} =
+ [int||
+ In file #{styleIfNeeded Faint (styleIfNeeded Bold crFile)}
+ #{crCopiedRef}\
+ is possibly a bad copy paste of
+ #{crOriginalRef}
+ |]
+
reportVerifyErrs
:: Given ColorMode => NonEmpty (WithReferenceLoc VerifyError) -> IO ()
reportVerifyErrs errs = fmt
@@ -265,6 +285,17 @@ reportVerifyErrs errs = fmt
Invalid references dumped, #{length errs} in total.
|]
+reportCopyPasteErrors -- Formats (via `fmt`) a headed list of all copy/paste findings followed by their total count.
+ :: Given ColorMode => NonEmpty CopyPasteCheckResult -> IO ()
+reportCopyPasteErrors errs = fmt
+ [int||
+ === Possible copy/paste errors ===
+
+ #{interpolateIndentF 2 (interpolateBlockListF' "➥ " build errs)}
+ Possible copy/paste errors dumped, #{length errs} in total.
+ |]
+
+
data RetryAfter = Date UTCTime | Seconds (Time Second)
deriving stock (Show, Eq)
@@ -355,7 +386,7 @@ verifyRepo
-> VerifyMode
-> FilePath
-> RepoInfo
- -> IO (VerifyResult $ WithReferenceLoc VerifyError)
+ -> IO (VerifyResult $ WithReferenceLoc VerifyError, [CopyPasteCheckResult])
verifyRepo
rw
config@Config{..}
@@ -363,24 +394,32 @@ verifyRepo
root
repoInfo'@(RepoInfo files _)
= do
- let toScan = do
- (file, fileInfo) <- M.toList files
+
+ let filesToScan = flip mapMaybe (M.toList files) $ \(file, fileInfo) -> do
guard . not $ matchesGlobPatterns root (ecIgnoreRefsFrom cExclusions) file
case fileInfo of
Scanned fi -> do
- ref <- _fiReferences fi
- return (file, ref)
- NotScannable -> empty -- No support for such file, can do nothing.
- NotAddedToGit -> empty -- If this file is scannable, we've notified
+ Just (file, fi)
+ NotScannable -> Nothing -- No support for such file, can do nothing.
+ NotAddedToGit -> Nothing -- If this file is scannable, we've notified
-- user that we are scanning only files
-- added to Git while gathering RepoInfo.
+ toCheckCopyPaste = map (second _fiReferences) $ filter (_fiCopyPasteCheck . snd) filesToScan
+ toScan = concatMap (\(file, fileInfo) -> map (file,) $ _fiReferences fileInfo) filesToScan
+ copyPasteErrors = if scCopyPasteCheckEnabled cScanners
+ then [ res
+ | (file, refs) <- toCheckCopyPaste,
+ res <- checkCopyPaste file refs
+ ]
+ else []
+
progressRef <- newIORef $ initVerifyProgress (map snd toScan)
accumulated <- loopAsyncUntil (printer progressRef) do
forConcurrentlyCaching toScan ifExternalThenCache $ \(file, ref) ->
verifyReference config mode progressRef repoInfo' root file ref
- case accumulated of
+ (, copyPasteErrors) <$> case accumulated of
Right res -> return $ fold res
Left (exception, partialRes) -> do
-- The user has hit Ctrl+C; display any verification errors we managed to find and exit.
@@ -412,6 +451,41 @@ verifyRepo
ExternalLoc -> CacheUnderKey rLink
_ -> NoCaching
+checkCopyPaste :: FilePath -> [Reference] -> [CopyPasteCheckResult] -- Finds references of one file that look like bad copy-pastes of another reference.
+checkCopyPaste file refs = do
+ let getLinkAndAnchor x = (rLink x, rAnchor x)
+ groupedRefs = -- references opted in to the check, grouped by identical (link, anchor)
+ L.groupBy ((==) `on` getLinkAndAnchor) $
+ sortBy (compare `on` getLinkAndAnchor) $ -- groupBy only merges adjacent items, hence the sort
+ filter rCheckCopyPaste refs
+ concatMap checkGroup groupedRefs
+ where
+ checkGroup :: [Reference] -> [CopyPasteCheckResult]
+ checkGroup refsInGroup = do
+ let mergeLinkAndAnchor ref = maybe (rLink ref) (rLink ref <>) $ rAnchor ref
+ let refsInGroup' = flip map refsInGroup $ \ref -> -- pair each reference with its normalised (name, link <> anchor)
+ (ref, (prepareNameForCheck $ rName ref,
+ prepareNameForCheck $ mergeLinkAndAnchor ref))
+ -- Most of the time this will be Nothing, and then `others` is not needed.
+ let mbSubstrRef = fst <$> find (textIsLinkSubstr . snd) refsInGroup'
+ others = fst <$> filter (not . textIsLinkSubstr . snd) refsInGroup'
+ maybe [] (\substrRef -> map (CopyPasteCheckResult file substrRef) others) mbSubstrRef -- first matching reference is the "original"; every non-matching one is reported against it
+
+ textIsLinkSubstr :: (Text, Text) -> Bool
+ textIsLinkSubstr (prepName, prepLink) = prepName `isSubSeq` prepLink
+
+ prepareNameForCheck :: Text -> Text
+ prepareNameForCheck = T.toLower . T.filter isAlphaNum -- keep alphanumeric characters only, lower-cased
+
+ isSubSeq :: Text -> Text -> Bool -- subsequence test: same characters in order, not necessarily contiguous
+ isSubSeq "" _str = True
+ isSubSeq _que "" = False
+ isSubSeq que str
+ | qhead == shead = isSubSeq qtail stail
+ | otherwise = isSubSeq que stail
+ where (qhead, qtail) = T.splitAt 1 que
+ (shead, stail) = T.splitAt 1 str
+
shouldCheckLocType :: VerifyMode -> LocationType -> Bool
shouldCheckLocType mode locType
| isExternal locType = shouldCheckExternal mode
diff --git a/tests/Test/Xrefcheck/IgnoreAnnotationsSpec.hs b/tests/Test/Xrefcheck/IgnoreAnnotationsSpec.hs
index c905bb5c..a2cb14a6 100644
--- a/tests/Test/Xrefcheck/IgnoreAnnotationsSpec.hs
+++ b/tests/Test/Xrefcheck/IgnoreAnnotationsSpec.hs
@@ -18,7 +18,8 @@ import Xrefcheck.Scanners.Markdown
test_ignoreAnnotations :: [TestTree]
test_ignoreAnnotations =
- [ testGroup "Parsing failures"
+ [ testGroup "Parsing failures" $
+ [ testGroup "Ignore annotations"
[ testCase "Check if broken link annotation produce error" do
let file = "tests/markdowns/with-annotations/no_link.md"
errs <- getErrs file
@@ -31,35 +32,71 @@ test_ignoreAnnotations =
let file = "tests/markdowns/with-annotations/unexpected_ignore_file.md"
errs <- getErrs file
errs @?= makeError (Just $ PosInfo 9 1 9 29) file FileErr
- , testCase "Check if broken unrecognised annotation produce error" do
+ ]
+ , testGroup "Ignore copypaste check annotations"
+ [ testCase "Check if broken copypaste link annotation produce error" do
+ let file = "tests/markdowns/with-annotations/no_link_cpc.md"
+ errs <- getErrs file
+ errs @?= makeError (Just $ PosInfo 7 1 7 48) file LinkErrCpc
+ , testCase "Check if broken copypaste paragraph annotation produce error" do
+ let file = "tests/markdowns/with-annotations/no_paragraph_cpc.md"
+ errs <- getErrs file
+ errs @?= makeError (Just $ PosInfo 7 1 7 53) file (ParagraphErrCpc "HEADING")
+ , testCase "Check if broken copypaste ignore file annotation produce error" do
+ let file = "tests/markdowns/with-annotations/unexpected_ignore_file_cpc.md"
+ errs <- getErrs file
+ errs @?= makeError (Just $ PosInfo 9 1 9 47) file FileErrCpc
+ ]
+ , testCase "Check if broken unrecognised annotation produce error" do
let file = "tests/markdowns/with-annotations/unrecognised_option.md"
errs <- getErrs file
errs @?= makeError (Just $ PosInfo 7 1 7 46) file (UnrecognisedErr "ignore unrecognised-option")
- ]
- , testGroup "\"ignore link\" mode"
- [ testCase "Check \"ignore link\" performance" $ do
- let file = "tests/markdowns/with-annotations/ignore_link.md"
- (fi, errs) <- parse GitHub file
- getRefs fi @?=
- ["team", "team", "team", "hire-us", "how-we-work", "privacy", "link2", "link2", "link3"]
- errs @?= makeError (Just $ PosInfo 42 1 42 31) file LinkErr
- ]
- , testGroup "\"ignore paragraph\" mode"
- [ testCase "Check \"ignore paragraph\" performance" $ do
- (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_paragraph.md"
- getRefs fi @?= ["blog", "contacts"]
- errs @?= []
- ]
- , testGroup "\"ignore all\" mode"
- [ testCase "Check \"ignore all\" performance" $ do
- (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_file.md"
- getRefs fi @?= []
- errs @?= []
- ]
+ ]
+ , testGroup "Check ignore pragmas" $
+ [ testGroup "\"ignore link\" mode"
+ [ testCase "Check \"ignore link\" performance" $ do
+ let file = "tests/markdowns/with-annotations/ignore_link.md"
+ (fi, errs) <- parse GitHub file
+ getRefs fi @?=
+ ["team", "team", "team", "hire-us", "how-we-work", "privacy", "link2", "link2", "link3"]
+ errs @?= makeError (Just $ PosInfo 42 1 42 31) file LinkErr
+ ]
+ , testGroup "\"ignore paragraph\" mode"
+ [ testCase "Check \"ignore paragraph\" performance" $ do
+ (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_paragraph.md"
+ getRefs fi @?= ["blog", "contacts"]
+ errs @?= []
+ ]
+ , testGroup "\"ignore all\" mode"
+ [ testCase "Check \"ignore all\" performance" $ do
+ (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_file.md"
+ getRefs fi @?= []
+ errs @?= []
+ ]
+ ]
+ , testGroup "Check ignore copypaste check pragmas" $
+ [ testCase "Check ignore duplication check for link pragmas" $ do
+ let file = "tests/markdowns/with-annotations/ignore_link_cpc.md"
+ (fi, errs) <- parse GitHub file
+ getRefsWithCpc fi @?=
+ ["team", "team", "team", "hire-us", "how-we-work", "privacy", "link2", "link2", "link3"]
+ errs @?= makeError (Just $ PosInfo 42 1 42 48) file LinkErrCpc
+ , testCase "Check ignore copypaste check for paragraph pragmas" $ do
+ (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_paragraph_cpc.md"
+ getRefsWithCpc fi @?= ["blog", "contacts"]
+ errs @?= []
+ , testCase "Check ignore copypaste check in file performance" $ do
+ (fi, errs) <- parse GitHub "tests/markdowns/with-annotations/ignore_file_cpc.md"
+ fi ^. fiCopyPasteCheck @?= False
+ errs @?= []
+ ]
]
where
getRefs :: FileInfo -> [Text]
getRefs fi = map rName $ fi ^. fiReferences
+ getRefsWithCpc :: FileInfo -> [Text]
+ getRefsWithCpc fi = map rName $ filter rCheckCopyPaste $ fi ^. fiReferences
+
getErrs :: FilePath -> IO [ScanError]
getErrs path = snd <$> parse GitHub path
diff --git a/tests/Test/Xrefcheck/IgnoreRegexSpec.hs b/tests/Test/Xrefcheck/IgnoreRegexSpec.hs
index c3143086..31bd1d53 100644
--- a/tests/Test/Xrefcheck/IgnoreRegexSpec.hs
+++ b/tests/Test/Xrefcheck/IgnoreRegexSpec.hs
@@ -44,7 +44,7 @@ test_ignoreRegex = give WithoutColors $
verifyRes <- allowRewrite showProgressBar $ \rw ->
verifyRepo rw config verifyMode root $ srRepoInfo scanResult
- let brokenLinks = pickBrokenLinks verifyRes
+ let brokenLinks = pickBrokenLinks $ fst verifyRes
let matchedLinks =
[ "https://bad.referenc/"
diff --git a/tests/Test/Xrefcheck/TooManyRequestsSpec.hs b/tests/Test/Xrefcheck/TooManyRequestsSpec.hs
index 8b07d490..a61ae50e 100644
--- a/tests/Test/Xrefcheck/TooManyRequestsSpec.hs
+++ b/tests/Test/Xrefcheck/TooManyRequestsSpec.hs
@@ -63,7 +63,7 @@ test_tooManyRequests = testGroup "429 response tests"
}
}
_ <- verifyReferenceWithProgress
- (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing))
+ (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing) False)
progressRef
Progress{..} <- vrExternal <$> readIORef progressRef
let ttc = ttTimeToCompletion <$> pTaskTimestamp
@@ -88,7 +88,7 @@ test_tooManyRequests = testGroup "429 response tests"
}
}
_ <- verifyReferenceWithProgress
- (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing))
+ (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing) False)
progressRef
Progress{..} <- vrExternal <$> readIORef progressRef
let ttc = fromMaybe (sec 0) $ ttTimeToCompletion <$> pTaskTimestamp
@@ -114,7 +114,7 @@ test_tooManyRequests = testGroup "429 response tests"
}
}
_ <- verifyReferenceWithProgress
- (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing))
+ (Reference "" "http://127.0.0.1:5000/429" Nothing (Position Nothing) False)
progressRef
Progress{..} <- vrExternal <$> readIORef progressRef
let ttc = ttTimeToCompletion <$> pTaskTimestamp
diff --git a/tests/Test/Xrefcheck/UtilRequests.hs b/tests/Test/Xrefcheck/UtilRequests.hs
index 8582cd8c..1285a600 100644
--- a/tests/Test/Xrefcheck/UtilRequests.hs
+++ b/tests/Test/Xrefcheck/UtilRequests.hs
@@ -62,7 +62,7 @@ checkLinkAndProgressWithServer mock link progress vrExpectation =
verifyLink :: Text -> IO (VerifyResult VerifyError, Progress Int)
verifyLink link = do
- let reference = Reference "" link Nothing (Position Nothing)
+ let reference = Reference "" link Nothing (Position Nothing) False
progRef <- newIORef $ initVerifyProgress [reference]
result <- verifyReferenceWithProgress reference progRef
p <- readIORef progRef
diff --git a/tests/configs/github-config.yaml b/tests/configs/github-config.yaml
index 35e3a088..c0698528 100644
--- a/tests/configs/github-config.yaml
+++ b/tests/configs/github-config.yaml
@@ -56,3 +56,6 @@ scanners:
#
# This affects which anchors are generated for headers.
flavor: GitHub
+
+ # Whether copy-paste check is enabled globally.
+ copyPasteCheckEnabled: True
diff --git a/tests/golden/check-copy-paste/check-copy-paste.bats b/tests/golden/check-copy-paste/check-copy-paste.bats
new file mode 100644
index 00000000..45971b8a
--- /dev/null
+++ b/tests/golden/check-copy-paste/check-copy-paste.bats
@@ -0,0 +1,17 @@
+#!/usr/bin/env bats
+
+# SPDX-FileCopyrightText: 2022 Serokell
+#
+# SPDX-License-Identifier: MPL-2.0
+
+load '../helpers/bats-support/load'
+load '../helpers/bats-assert/load'
+load '../helpers/bats-file/load'
+load '../helpers'
+
+
+@test "Check possible copy-paste errors and copy-paste annotations" {
+ to_temp xrefcheck
+
+ assert_diff expected.gold
+}
diff --git a/tests/golden/check-copy-paste/expected.gold b/tests/golden/check-copy-paste/expected.gold
new file mode 100644
index 00000000..1077dd7d
--- /dev/null
+++ b/tests/golden/check-copy-paste/expected.gold
@@ -0,0 +1,72 @@
+=== Scan errors found ===
+
+ ➥ In file second-file.md
+ scan error at src:35:1-25:
+
+ Unrecognised option "no dh", perhaps you meant
+ <"ignore link"|"ignore paragraph"|"ignore all">
+ or "no duplication check in "?
+
+ ➥ In file second-file.md
+ scan error at src:40:1-53:
+
+ Expected a PARAGRAPH after "no duplication check in paragraph" annotation, but found HEADING
+
+ ➥ In file second-file.md
+ scan error at src:46:1-48:
+
+ Expected a LINK after "no duplication check in link" annotation
+
+ ➥ In file second-file.md
+ scan error at src:51:1-48:
+
+ Annotation "no duplication check in file" must be at the top of markdown or right after comments at the top
+
+Scan errors dumped, 4 in total.
+=== Possible copy/paste errors ===
+
+ ➥ In file second-file.md
+ reference (relative) at src:20:1-29:
+ - text: "Lol Kek"
+ - link: ./first-file.md
+ - anchor: -
+ is possibly a bad copy paste of
+ reference (relative) at src:10:1-34:
+ - text: "First file"
+ - link: ./first-file.md
+ - anchor: -
+
+ ➥ In file second-file.md
+ reference (relative) at src:21:1-30:
+ - text: "Baz quux"
+ - link: ./first-file.md
+ - anchor: -
+ is possibly a bad copy paste of
+ reference (relative) at src:10:1-34:
+ - text: "First file"
+ - link: ./first-file.md
+ - anchor: -
+
+ ➥ In file second-file.md
+ reference (relative) at src:31:1-29:
+ - text: "fdw"
+ - link: ./first-file.md
+ - anchor: chor
+ is possibly a bad copy paste of
+ reference (relative) at src:30:1-32:
+ - text: "ff-cho"
+ - link: ./first-file.md
+ - anchor: chor
+
+ ➥ In file second-file.md
+ reference (external) at src:70:1-28:
+ - text: "gitlab"
+ - link: https://github.com
+ - anchor: -
+ is possibly a bad copy paste of
+ reference (external) at src:69:1-28:
+ - text: "github"
+ - link: https://github.com
+ - anchor: -
+
+Possible copy/paste errors dumped, 4 in total.
diff --git a/tests/golden/check-copy-paste/first-file.md b/tests/golden/check-copy-paste/first-file.md
new file mode 100644
index 00000000..2b4cf38c
--- /dev/null
+++ b/tests/golden/check-copy-paste/first-file.md
@@ -0,0 +1,18 @@
+
+
+
+
+
+[ Second - ---file- ](./second-file.md)
+[ Link 2](./second-file.md)
+
+# heading
+
+# anch
+
+# chor
diff --git a/tests/golden/check-copy-paste/second-file.md b/tests/golden/check-copy-paste/second-file.md
new file mode 100644
index 00000000..43297e4a
--- /dev/null
+++ b/tests/golden/check-copy-paste/second-file.md
@@ -0,0 +1,75 @@
+
+
+
+
+[ First file ](./first-file.md)
+
+
+
+[ Link 2](./first-file.md)
+
+
+[ Link 3](./first-file.md#heading)
+
+
+[ Lol Kek](./first-file.md)
+[ Baz quux](./first-file.md)
+
+
+
+[ asd](./first-file.md#anch)
+[ fdw](./first-file.md#anch)
+
+
+
+[ ff-cho](./first-file.md#chor)
+[ fdw](./first-file.md#chor)
+
+
+
+
+
+
+
+
+
+
+# asd
+
+
+
+
+
+# asd
+
+
+
+
+
+
+
+[ Link 3](./first-file.md)
+
+
+
+
+hello, how are you, bye
+
+
+
+[github](https://github.com)
+[gitlab](https://github.com)
+
+
+[github](https://github.com)
+[gitlab](https://github.com)
+
+
+
+[github](https://github.com)
+[gitlab](https://github.com)
diff --git a/tests/golden/check-scan-errors/expected.gold b/tests/golden/check-scan-errors/expected.gold
index a1933ffe..ca734335 100644
--- a/tests/golden/check-scan-errors/expected.gold
+++ b/tests/golden/check-scan-errors/expected.gold
@@ -18,7 +18,9 @@
➥ In file check-scan-errors.md
scan error at src:21:1-50:
- Unrecognised option "ignore unrecognised-annotation" perhaps you meant <"ignore link"|"ignore paragraph"|"ignore all">
+ Unrecognised option "ignore unrecognised-annotation", perhaps you meant
+ <"ignore link"|"ignore paragraph"|"ignore all">
+ or "no duplication check in "?
➥ In file check-second-file.md
scan error at src:9:1-29:
diff --git a/tests/markdowns/with-annotations/ignore_file_cpc.md b/tests/markdowns/with-annotations/ignore_file_cpc.md
new file mode 100644
index 00000000..691eadac
--- /dev/null
+++ b/tests/markdowns/with-annotations/ignore_file_cpc.md
@@ -0,0 +1,19 @@
+
+
+
+
+
+
+
+Serokell [web-site](https://serokell.io/)
+Serokell [team](https://serokell.io/team)
+
+Serokell [blog](https://serokell.io/blog)
+
+Serokell [labs](https://serokell.io/labs)
+
+Serokell [contacts](https://serokell.io/contacts)
diff --git a/tests/markdowns/with-annotations/ignore_link_cpc.md b/tests/markdowns/with-annotations/ignore_link_cpc.md
new file mode 100644
index 00000000..b4e6e231
--- /dev/null
+++ b/tests/markdowns/with-annotations/ignore_link_cpc.md
@@ -0,0 +1,49 @@
+
+
+### Do not check the first link in the paragraph
+
+
+Serokell [web-site](https://serokell.io/)
+Serokell [team](https://serokell.io/team)
+
+
+
+Serokell [blog](https://serokell.io/blog)
+
+Serokell [labs](https://serokell.io/labs)
+
+Serokell
+[contacts](https://serokell.io/contacts) and again
+[team](https://serokell.io/team)
+
+### Do not check a link that is not the first in the paragraph
+
+[team](https://serokell.io/team) again and [projects](https://serokell.io/projects)
+
+Also [hire-us](https://serokell.io/hire-us) and
+[fintech](https://serokell.io/fintech-development)
+development
+
+Here are [how-we-work](https://serokell.io/how-we-work) and [privacy](https://serokell.io/privacy)
+and [ml consulting](https://serokell.io/machine-learning-consulting)
+
+
+Do not check link bug _regression test_ [link1](link1) [link2](link2)
+
+
+Another no duplication check in link bug _some [link1](link1) emphasis_ [link2](link2)
+
+### "Do not check" pragma should be followed by a link
+
+
+
+This annotation expects a link in the paragraph right after it.
+
+So [link3](link3) is still checked for copypaste.
+
+Annotation inside paragraph allows
+softbreaks and __other *things*__ in paragraph, so [link4](link4) is not checked for copypaste.
diff --git a/tests/markdowns/with-annotations/ignore_paragraph_cpc.md b/tests/markdowns/with-annotations/ignore_paragraph_cpc.md
new file mode 100644
index 00000000..633cc96e
--- /dev/null
+++ b/tests/markdowns/with-annotations/ignore_paragraph_cpc.md
@@ -0,0 +1,16 @@
+
+
+
+Serokell [web-site](https://serokell.io/)
+Serokell [team](https://serokell.io/team)
+
+Serokell [blog](https://serokell.io/blog)
+
+
+Serokell [labs](https://serokell.io/labs)
+
+Serokell [contacts](https://serokell.io/contacts)
diff --git a/tests/markdowns/with-annotations/no_link_cpc.md b/tests/markdowns/with-annotations/no_link_cpc.md
new file mode 100644
index 00000000..e1671e52
--- /dev/null
+++ b/tests/markdowns/with-annotations/no_link_cpc.md
@@ -0,0 +1,8 @@
+
+
+
+not a link
diff --git a/tests/markdowns/with-annotations/no_paragraph_cpc.md b/tests/markdowns/with-annotations/no_paragraph_cpc.md
new file mode 100644
index 00000000..03967d47
--- /dev/null
+++ b/tests/markdowns/with-annotations/no_paragraph_cpc.md
@@ -0,0 +1,9 @@
+
+
+
+
+# not a paragraph
diff --git a/tests/markdowns/with-annotations/unexpected_ignore_file_cpc.md b/tests/markdowns/with-annotations/unexpected_ignore_file_cpc.md
new file mode 100644
index 00000000..70981ebe
--- /dev/null
+++ b/tests/markdowns/with-annotations/unexpected_ignore_file_cpc.md
@@ -0,0 +1,11 @@
+
+
+the first paragraph
+
+
+
+the second paragraph