From 44f21e50cb5939da7af61860bcff1861ab54daff Mon Sep 17 00:00:00 2001 From: Yuri Romanowski Date: Tue, 13 Dec 2022 13:49:52 +0500 Subject: [PATCH 1/5] [#64] Implement copy/paste protection checks Problem: Currently xrefcheck is not able to detect possible bad copy-pastes, when some links are referring the same file, but from the link names it seems that one of those links should refer other file. Solution: Implement check, add corresponding settings to the config. --- src/Xrefcheck/Command.hs | 5 +- src/Xrefcheck/Config.hs | 5 + src/Xrefcheck/Config/Default.hs | 3 + src/Xrefcheck/Core.hs | 4 +- src/Xrefcheck/Verify.hs | 102 +++++++++++++++--- tests/Test/Xrefcheck/CopyPasteCheckSpec.hs | 65 +++++++++++ tests/Test/Xrefcheck/IgnoreRegexSpec.hs | 2 +- tests/configs/github-config.yaml | 3 + .../check-copy-paste/check-copy-paste.bats | 17 +++ tests/golden/check-copy-paste/expected.gold | 48 +++++++++ tests/golden/check-copy-paste/first-file.md | 11 ++ tests/golden/check-copy-paste/second-file.md | 29 +++++ 12 files changed, 277 insertions(+), 17 deletions(-) create mode 100644 tests/Test/Xrefcheck/CopyPasteCheckSpec.hs create mode 100644 tests/golden/check-copy-paste/check-copy-paste.bats create mode 100644 tests/golden/check-copy-paste/expected.gold create mode 100644 tests/golden/check-copy-paste/first-file.md create mode 100644 tests/golden/check-copy-paste/second-file.md diff --git a/src/Xrefcheck/Command.hs b/src/Xrefcheck/Command.hs index 16620a86..12ee272e 100644 --- a/src/Xrefcheck/Command.hs +++ b/src/Xrefcheck/Command.hs @@ -28,7 +28,7 @@ import Xrefcheck.Scan import Xrefcheck.Scanners.Markdown (markdownSupport) import Xrefcheck.System (askWithinCI) import Xrefcheck.Util -import Xrefcheck.Verify (reportVerifyErrs, verifyErrors, verifyRepo) +import Xrefcheck.Verify (reportCopyPasteErrors, reportVerifyErrs, verifyErrors, verifyRepo) readConfig :: FilePath -> IO Config readConfig path = fmap (normaliseConfigFilePaths . overrideConfig) do @@ -81,11 +81,12 @@ defaultAction Options{..} = do whenJust (nonEmpty $ sortBy (compare `on` seFile) scanErrs) $ reportScanErrs - verifyRes <- allowRewrite showProgressBar $ \rw -> do + (verifyRes, copyPasteErrors) <- allowRewrite showProgressBar $ \rw -> do let fullConfig = config { cNetworking = addNetworkingOptions (cNetworking config) oNetworkingOptions } verifyRepo rw fullConfig oMode oRoot repoInfo + whenJust (nonEmpty copyPasteErrors) reportCopyPasteErrors case verifyErrors verifyRes of Nothing | null scanErrs -> fmtLn "All repository links are valid." Nothing -> exitFailure diff --git a/src/Xrefcheck/Config.hs b/src/Xrefcheck/Config.hs index 1496167b..a0d2ff4b 100644 --- a/src/Xrefcheck/Config.hs +++ b/src/Xrefcheck/Config.hs @@ -73,6 +73,8 @@ data ScannersConfig' f = ScannersConfig , scAnchorSimilarityThreshold :: Field f Double -- ^ On 'anchor not found' error, how much similar anchors should be displayed as -- hint. Number should be between 0 and 1, larger value means stricter filter. + , scCopyPasteCheckEnabled :: Field f Bool + -- ^ Whether copy-paste check is enabled globally. } deriving stock (Generic) makeLensesWith postfixFields ''Config' @@ -94,6 +96,9 @@ overrideConfig config , scAnchorSimilarityThreshold = fromMaybe (scAnchorSimilarityThreshold defScanners) $ scAnchorSimilarityThreshold (cScanners config) + , scCopyPasteCheckEnabled = + fromMaybe (scCopyPasteCheckEnabled defScanners) + $ scCopyPasteCheckEnabled (cScanners config) } } where diff --git a/src/Xrefcheck/Config/Default.hs b/src/Xrefcheck/Config/Default.hs index 72bcd1a2..8550b0bd 100644 --- a/src/Xrefcheck/Config/Default.hs +++ b/src/Xrefcheck/Config/Default.hs @@ -67,6 +67,9 @@ scanners: # # This affects which anchors are generated for headers. flavor: #s{flavor} + + # Whether copy-paste check is enabled globally. + copyPasteCheckEnabled: True |] where ignoreLocalRefsFrom :: NonEmpty Text diff --git a/src/Xrefcheck/Core.hs b/src/Xrefcheck/Core.hs index 9c3ae450..3863285f 100644 --- a/src/Xrefcheck/Core.hs +++ b/src/Xrefcheck/Core.hs @@ -61,7 +61,7 @@ instance FromJSON Flavor where -- We keep this in text because scanners for different formats use different -- representation of this thing, and it actually appears in reports only. newtype Position = Position (Maybe Text) - deriving stock (Show, Eq, Generic) + deriving stock (Show, Eq, Generic, Ord) instance Given ColorMode => Buildable Position where build (Position pos) = case pos of @@ -77,7 +77,7 @@ data Reference = Reference , rAnchor :: Maybe Text -- ^ Section or custom anchor tag. , rPos :: Position - } deriving stock (Show, Generic) + } deriving stock (Show, Generic, Eq, Ord) -- | Context of anchor. data AnchorType diff --git a/src/Xrefcheck/Verify.hs b/src/Xrefcheck/Verify.hs index 1ed1fdbb..b7daca93 100644 --- a/src/Xrefcheck/Verify.hs +++ b/src/Xrefcheck/Verify.hs @@ -26,20 +26,33 @@ module Xrefcheck.Verify , verifyReference , checkExternalResource + -- * Copypaste check + , checkCopyPaste + , CopyPasteCheckResult (..) + -- * URI parsing , parseUri + + -- * Reporting errors , reportVerifyErrs + , reportCopyPasteErrors ) where import Universum import Control.Concurrent.Async (Async, async, cancel, poll, wait, withAsync) import Control.Exception (AsyncException (..), throwIO) +import Control.Exception.Safe (handleAsync, handleJust) import Control.Monad.Except (MonadError (..)) +import Data.Bits (toIntegralSized) import Data.ByteString qualified as BS +import Data.Char (isAlphaNum) +import Data.List (lookup) import Data.List qualified as L import Data.Map qualified as M import Data.Reflection (Given) +import Data.Text (toCaseFold) +import Data.Text qualified as T import Data.Text.Metrics (damerauLevenshteinNorm) import Data.Time (UTCTime, defaultTimeLocale, formatTime, readPTime, rfc822DateFormat) import Data.Time.Clock.POSIX (getPOSIXTime) @@ -66,10 +79,6 @@ import Text.URI (Authority (..), ParseExceptionBs, URI (..), mkURIBs) import Time (RatioNat, Second, Time (..), ms, sec, threadDelay, timeout, (+:+), (-:-)) import URI.ByteString qualified as URIBS -import Control.Exception.Safe (handleAsync, handleJust) -import Data.Bits (toIntegralSized) -import Data.List (lookup) -import Data.Text (toCaseFold) import Xrefcheck.Config import Xrefcheck.Core import Xrefcheck.Orphans () @@ -255,6 +264,21 @@ instance Given ColorMode => Buildable VerifyError where #{redirectedUrl} |] +data CopyPasteCheckResult = CopyPasteCheckResult + { crFile :: FilePath, + crOriginalRef :: Reference, + crCopiedRef :: Reference + } deriving stock (Show, Eq, Ord) + +instance (Given ColorMode) => Buildable CopyPasteCheckResult where + build CopyPasteCheckResult {..} = + [int|| + In file #{styleIfNeeded Faint (styleIfNeeded Bold crFile)} + #{crCopiedRef}\ + is possibly a bad copy paste of + #{crOriginalRef} + |] + reportVerifyErrs :: Given ColorMode => NonEmpty (WithReferenceLoc VerifyError) -> IO () reportVerifyErrs errs = fmt @@ -265,6 +289,17 @@ reportVerifyErrs errs = fmt Invalid references dumped, #{length errs} in total. |] +reportCopyPasteErrors + :: Given ColorMode => NonEmpty CopyPasteCheckResult -> IO () +reportCopyPasteErrors errs = fmt + [int|| + === Possible copy/paste errors === + + #{interpolateIndentF 2 (interpolateBlockListF' "➥ " build errs)} + Possible copy/paste errors dumped, #{length errs} in total. + |] + + data RetryAfter = Date UTCTime | Seconds (Time Second) deriving stock (Show, Eq) @@ -355,7 +390,7 @@ verifyRepo -> VerifyMode -> FilePath -> RepoInfo - -> IO (VerifyResult $ WithReferenceLoc VerifyError) + -> IO (VerifyResult $ WithReferenceLoc VerifyError, [CopyPasteCheckResult]) verifyRepo rw config@Config{..} @@ -363,24 +398,32 @@ verifyRepo root repoInfo'@(RepoInfo files _) = do - let toScan = do - (file, fileInfo) <- M.toList files + + let filesToScan = flip mapMaybe (M.toList files) $ \(file, fileInfo) -> do guard . not $ matchesGlobPatterns root (ecIgnoreRefsFrom cExclusions) file case fileInfo of Scanned fi -> do - ref <- _fiReferences fi - return (file, ref) - NotScannable -> empty -- No support for such file, can do nothing. - NotAddedToGit -> empty -- If this file is scannable, we've notified + Just (file, fi) + NotScannable -> Nothing -- No support for such file, can do nothing. + NotAddedToGit -> Nothing -- If this file is scannable, we've notified -- user that we are scanning only files -- added to Git while gathering RepoInfo. + toCheckCopyPaste = map (second _fiReferences) filesToScan + toScan = concatMap (\(file, fileInfo) -> map (file,) $ _fiReferences fileInfo) filesToScan + copyPasteErrors = if scCopyPasteCheckEnabled cScanners + then [ res + | (file, refs) <- toCheckCopyPaste, + res <- checkCopyPaste file refs + ] + else [] + progressRef <- newIORef $ initVerifyProgress (map snd toScan) accumulated <- loopAsyncUntil (printer progressRef) do forConcurrentlyCaching toScan ifExternalThenCache $ \(file, ref) -> verifyReference config mode progressRef repoInfo' root file ref - case accumulated of + (, copyPasteErrors) <$> case accumulated of Right res -> return $ fold res Left (exception, partialRes) -> do -- The user has hit Ctrl+C; display any verification errors we managed to find and exit. @@ -412,6 +455,41 @@ verifyRepo ExternalLoc -> CacheUnderKey rLink _ -> NoCaching +checkCopyPaste :: FilePath -> [Reference] -> [CopyPasteCheckResult] +checkCopyPaste file refs = do + let getLinkAndAnchor x = (rLink x, rAnchor x) + groupedRefs = + L.groupBy ((==) `on` getLinkAndAnchor) $ + sortBy (compare `on` getLinkAndAnchor) refs + concatMap checkGroup groupedRefs + where + checkGroup :: [Reference] -> [CopyPasteCheckResult] + checkGroup refsInGroup = do + let mergeLinkAndAnchor ref = maybe (rLink ref) (rLink ref <>) $ rAnchor ref + let refsInGroup' = flip map refsInGroup $ \ref -> + (ref, (prepareNameForCheck $ rName ref, + prepareNameForCheck $ mergeLinkAndAnchor ref)) + -- Most of time this will be Nothing and we won't need `others`. + -- The first matching link will be shown as original. + let mbSubstrRef = fst <$> find (textIsLinkSubstr . snd) refsInGroup' + others = fst <$> filter (not . textIsLinkSubstr . snd) refsInGroup' + maybe [] (\substrRef -> map (CopyPasteCheckResult file substrRef) others) mbSubstrRef + + textIsLinkSubstr :: (Text, Text) -> Bool + textIsLinkSubstr (prepName, prepLink) = prepName `isSubSeq` prepLink + + prepareNameForCheck :: Text -> Text + prepareNameForCheck = T.toLower . T.filter isAlphaNum + + isSubSeq :: Text -> Text -> Bool + isSubSeq "" _str = True + isSubSeq _que "" = False + isSubSeq que str + | qhead == shead = isSubSeq qtail stail + | otherwise = isSubSeq que stail + where (qhead, qtail) = T.splitAt 1 que + (shead, stail) = T.splitAt 1 str + shouldCheckLocType :: VerifyMode -> LocationType -> Bool shouldCheckLocType mode locType | isExternal locType = shouldCheckExternal mode diff --git a/tests/Test/Xrefcheck/CopyPasteCheckSpec.hs b/tests/Test/Xrefcheck/CopyPasteCheckSpec.hs new file mode 100644 index 00000000..fbe09ba6 --- /dev/null +++ b/tests/Test/Xrefcheck/CopyPasteCheckSpec.hs @@ -0,0 +1,65 @@ +{- SPDX-FileCopyrightText: 2019 Serokell + - + - SPDX-License-Identifier: MPL-2.0 + -} + +module Test.Xrefcheck.CopyPasteCheckSpec where + +import Universum + +import Test.Tasty (TestTree, testGroup) +import Test.Tasty.HUnit (Assertion, testCase, (@?=)) + +import Xrefcheck.Core +import Xrefcheck.Verify + +assertUnordered :: (Show a, Ord a) => [a] -> [a] -> Assertion +assertUnordered = (@?=) `on` sort + +testPath :: FilePath +testPath = "test-path" + +test_copyPasteCheck :: TestTree +test_copyPasteCheck = testGroup "Copypaste check" + [ testCase "Detect copypaste error if there is a link with a matching name" $ do + let link = "./first-file" + anchor = Just "heading" + differentAnchor = Nothing + defPos = Position Nothing + original1 = Reference "_- First - - File" link anchor defPos + original2 = Reference "_- First - fi - le" link anchor defPos + notCopied = Reference " Link 2 " link differentAnchor defPos + copied1 = Reference " foo bar" link anchor defPos + copied2 = Reference " Baz quux" link anchor defPos + input = [original1, original2, notCopied, copied1, copied2] + res = checkCopyPaste testPath input + expectedRes = + -- only first matching link is shown in the output + [ CopyPasteCheckResult testPath original1 copied1 + , CopyPasteCheckResult testPath original1 copied2 + ] + res `assertUnordered` expectedRes + , testCase "Succeed if there is not link with a matching name" $ do + let link = "./first-file" + anchor = Just "heading" + defPos = Position Nothing + original1 = Reference "_Foo bar" link anchor defPos + original2 = Reference " Baz quux" link anchor defPos + original3 = Reference " Foo qubarx" link anchor defPos + input = [original1, original2, original3] + res = checkCopyPaste testPath input + expectedRes = [] + res @?= expectedRes + , testCase "Check external links" $ do + let link = "https://github.com" + anchor = Nothing + defPos = Position Nothing + original = Reference "github" link anchor defPos + copied = Reference "gitlab" link anchor defPos + input = [original, copied] + res = checkCopyPaste testPath input + expectedRes = + [ CopyPasteCheckResult testPath original copied + ] + res @?= expectedRes + ] diff --git a/tests/Test/Xrefcheck/IgnoreRegexSpec.hs b/tests/Test/Xrefcheck/IgnoreRegexSpec.hs index c3143086..31bd1d53 100644 --- a/tests/Test/Xrefcheck/IgnoreRegexSpec.hs +++ b/tests/Test/Xrefcheck/IgnoreRegexSpec.hs @@ -44,7 +44,7 @@ test_ignoreRegex = give WithoutColors $ verifyRes <- allowRewrite showProgressBar $ \rw -> verifyRepo rw config verifyMode root $ srRepoInfo scanResult - let brokenLinks = pickBrokenLinks verifyRes + let brokenLinks = pickBrokenLinks $ fst verifyRes let matchedLinks = [ "https://bad.referenc/" diff --git a/tests/configs/github-config.yaml b/tests/configs/github-config.yaml index 35e3a088..c0698528 100644 --- a/tests/configs/github-config.yaml +++ b/tests/configs/github-config.yaml @@ -56,3 +56,6 @@ scanners: # # This affects which anchors are generated for headers. flavor: GitHub + + # Whether copy-paste check is enabled globally. + copyPasteCheckEnabled: True diff --git a/tests/golden/check-copy-paste/check-copy-paste.bats b/tests/golden/check-copy-paste/check-copy-paste.bats new file mode 100644 index 00000000..45971b8a --- /dev/null +++ b/tests/golden/check-copy-paste/check-copy-paste.bats @@ -0,0 +1,17 @@ +#!/usr/bin/env bats + +# SPDX-FileCopyrightText: 2022 Serokell +# +# SPDX-License-Identifier: MPL-2.0 + +load '../helpers/bats-support/load' +load '../helpers/bats-assert/load' +load '../helpers/bats-file/load' +load '../helpers' + + +@test "Check possible copy-paste errors and copy-paste annotations " { + to_temp xrefcheck + + assert_diff expected.gold +} diff --git a/tests/golden/check-copy-paste/expected.gold b/tests/golden/check-copy-paste/expected.gold new file mode 100644 index 00000000..796b4c73 --- /dev/null +++ b/tests/golden/check-copy-paste/expected.gold @@ -0,0 +1,48 @@ +=== Possible copy/paste errors === + + ➥ In file second-file.md + reference (relative) at src:13:1-29: + - text: "Lol Kek" + - link: ./first-file.md + - anchor: - + is possibly a bad copy paste of + reference (relative) at src:7:1-34: + - text: "First file" + - link: ./first-file.md + - anchor: - + + ➥ In file second-file.md + reference (relative) at src:14:1-30: + - text: "Baz quux" + - link: ./first-file.md + - anchor: - + is possibly a bad copy paste of + reference (relative) at src:7:1-34: + - text: "First file" + - link: ./first-file.md + - anchor: - + + ➥ In file second-file.md + reference (relative) at src:24:1-29: + - text: "fdw" + - link: ./first-file.md + - anchor: chor + is possibly a bad copy paste of + reference (relative) at src:23:1-32: + - text: "ff-cho" + - link: ./first-file.md + - anchor: chor + + ➥ In file second-file.md + reference (external) at src:29:1-28: + - text: "gitlab" + - link: https://github.com + - anchor: - + is possibly a bad copy paste of + reference (external) at src:28:1-28: + - text: "github" + - link: https://github.com + - anchor: - + +Possible copy/paste errors dumped, 4 in total. +All repository links are valid. diff --git a/tests/golden/check-copy-paste/first-file.md b/tests/golden/check-copy-paste/first-file.md new file mode 100644 index 00000000..92b3319f --- /dev/null +++ b/tests/golden/check-copy-paste/first-file.md @@ -0,0 +1,11 @@ + + +# heading + +# anch + +# chor diff --git a/tests/golden/check-copy-paste/second-file.md b/tests/golden/check-copy-paste/second-file.md new file mode 100644 index 00000000..02f4d554 --- /dev/null +++ b/tests/golden/check-copy-paste/second-file.md @@ -0,0 +1,29 @@ + + +[ First file ](./first-file.md) + + +[ Link 3](./first-file.md#heading) + + +[ Lol Kek](./first-file.md) +[ Baz quux](./first-file.md) + + + +[ asd](./first-file.md#anch) +[ fdw](./first-file.md#anch) + + + +[ ff-cho](./first-file.md#chor) +[ fdw](./first-file.md#chor) + + + +[github](https://github.com) +[gitlab](https://github.com) From d497a1b40556ac4b95533d225496415c03fe90dd Mon Sep 17 00:00:00 2001 From: Yuri Romanowski Date: Fri, 23 Dec 2022 16:04:27 +0500 Subject: [PATCH 2/5] fixup! [#64] Implement copy/paste protection checks Review: formatting, partition --- src/Xrefcheck/Verify.hs | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/Xrefcheck/Verify.hs b/src/Xrefcheck/Verify.hs index b7daca93..41ce8a8a 100644 --- a/src/Xrefcheck/Verify.hs +++ b/src/Xrefcheck/Verify.hs @@ -47,7 +47,6 @@ import Control.Monad.Except (MonadError (..)) import Data.Bits (toIntegralSized) import Data.ByteString qualified as BS import Data.Char (isAlphaNum) -import Data.List (lookup) import Data.List qualified as L import Data.Map qualified as M import Data.Reflection (Given) @@ -265,9 +264,9 @@ instance Given ColorMode => Buildable VerifyError where |] data CopyPasteCheckResult = CopyPasteCheckResult - { crFile :: FilePath, - crOriginalRef :: Reference, - crCopiedRef :: Reference + { crFile :: FilePath + , crOriginalRef :: Reference + , crCopiedRef :: Reference } deriving stock (Show, Eq, Ord) instance (Given ColorMode) => Buildable CopyPasteCheckResult where @@ -299,8 +298,6 @@ reportCopyPasteErrors errs = fmt Possible copy/paste errors dumped, #{length errs} in total. |] - - data RetryAfter = Date UTCTime | Seconds (Time Second) deriving stock (Show, Eq) @@ -411,12 +408,14 @@ verifyRepo toCheckCopyPaste = map (second _fiReferences) filesToScan toScan = concatMap (\(file, fileInfo) -> map (file,) $ _fiReferences fileInfo) filesToScan - copyPasteErrors = if scCopyPasteCheckEnabled cScanners - then [ res - | (file, refs) <- toCheckCopyPaste, - res <- checkCopyPaste file refs - ] - else [] + copyPasteErrors = + if scCopyPasteCheckEnabled cScanners + then + [ res + | (file, refs) <- toCheckCopyPaste, + res <- checkCopyPaste file refs + ] + else [] progressRef <- newIORef $ initVerifyProgress (map snd toScan) @@ -466,13 +465,12 @@ checkCopyPaste file refs = do checkGroup :: [Reference] -> [CopyPasteCheckResult] checkGroup refsInGroup = do let mergeLinkAndAnchor ref = maybe (rLink ref) (rLink ref <>) $ rAnchor ref - let refsInGroup' = flip map refsInGroup $ \ref -> + let refsInGroup' = refsInGroup <&> \ref -> (ref, (prepareNameForCheck $ rName ref, prepareNameForCheck $ mergeLinkAndAnchor ref)) - -- Most of time this will be Nothing and we won't need `others`. - -- The first matching link will be shown as original. - let mbSubstrRef = fst <$> find (textIsLinkSubstr . snd) refsInGroup' - others = fst <$> filter (not . textIsLinkSubstr . snd) refsInGroup' + let (mbSubstrRef, others) = + bimap (fmap fst . safeHead) (map fst) $ + L.partition (textIsLinkSubstr . snd) refsInGroup' maybe [] (\substrRef -> map (CopyPasteCheckResult file substrRef) others) mbSubstrRef textIsLinkSubstr :: (Text, Text) -> Bool @@ -830,7 +828,7 @@ checkExternalResource Config{..} link | isPermanentRedirectCode code -> Left . PermanentRedirectError link . fmap decodeUtf8 - . lookup "Location" + . L.lookup "Location" $ responseHeaders resp | isTemporaryRedirectCode code -> Right () | isAllowedErrorCode code -> Right () From 2e0d0f6a315da1e21ad6d4a18d6d23c3e6fef6a6 Mon Sep 17 00:00:00 2001 From: Yuri Romanowski Date: Fri, 23 Dec 2022 16:20:05 +0500 Subject: [PATCH 3/5] fixup! [#64] Implement copy/paste protection checks Review: fix config, README, CHANGES --- CHANGES.md | 2 ++ README.md | 11 +++++++++++ src/Xrefcheck/Config/Default.hs | 4 ++-- tests/configs/github-config.yaml | 4 ++-- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index a4d037eb..f1655df6 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -41,6 +41,8 @@ Unreleased * [#231](https://github.com/serokell/xrefcheck/pull/231) + Anchor analysis takes now into account the appropriate case-sensitivity depending on the configured Markdown flavour. +* [#245](https://github.com/serokell/xrefcheck/pull/245) + + Now xrefcheck is able to detect possible copy-pastes relying on links and their names. 0.2.2 ========== diff --git a/README.md b/README.md index 73d76c56..2d59128a 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ Comparing to alternative solutions, this tool tries to achieve the following poi * Supports relative and absolute local links. * Supports external links (`http`, `https`, `ftp` and `ftps`). * Detects broken and ambiguous anchors in local links. +* Detects possible bad copy-pastes of links. * Integration with GitHub Actions. ## Dependencies [↑](#xrefcheck) @@ -148,6 +149,16 @@ There are several ways to fix this: * By default, `xrefcheck` will ignore links to localhost. * This behavior can be disabled by removing the corresponding entry from the `ignoreExternalRefsTo` list in the config file. +1. How does the copy-paste check work? + * If there are some references with the same links and for one of them name is similar to the link itself, other links may turn out to be copypasted from that link. Example: + ```md + Links with bad copypaste: + [good link](https://good.link.uri/). + [copypasted link](https://good.link.uri/). + ``` + The second link will be reported by `xrefcheck` as a possible bad copy-paste of the first link. + * This checks both external references and local ones. + ## Further work [↑](#xrefcheck) - [ ] Support link detection in different languages, not only Markdown. diff --git a/src/Xrefcheck/Config/Default.hs b/src/Xrefcheck/Config/Default.hs index 8550b0bd..5378d9e2 100644 --- a/src/Xrefcheck/Config/Default.hs +++ b/src/Xrefcheck/Config/Default.hs @@ -68,8 +68,8 @@ scanners: # This affects which anchors are generated for headers. flavor: #s{flavor} - # Whether copy-paste check is enabled globally. - copyPasteCheckEnabled: True + # Whether the copy-paste check is enabled globally. + copyPasteCheckEnabled: true |] where ignoreLocalRefsFrom :: NonEmpty Text diff --git a/tests/configs/github-config.yaml b/tests/configs/github-config.yaml index c0698528..1722de1a 100644 --- a/tests/configs/github-config.yaml +++ b/tests/configs/github-config.yaml @@ -57,5 +57,5 @@ scanners: # This affects which anchors are generated for headers. flavor: GitHub - # Whether copy-paste check is enabled globally. - copyPasteCheckEnabled: True + # Whether the copy-paste check is enabled globally. + copyPasteCheckEnabled: true From a844583bb9bf2dbf8cb6bf3c85f8ffc4cffbb829 Mon Sep 17 00:00:00 2001 From: Yuri Romanowski Date: Fri, 23 Dec 2022 18:56:45 +0500 Subject: [PATCH 4/5] fixup! fixup! [#64] Implement copy/paste protection checks --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2d59128a..f84e416d 100644 --- a/README.md +++ b/README.md @@ -150,9 +150,9 @@ There are several ways to fix this: * This behavior can be disabled by removing the corresponding entry from the `ignoreExternalRefsTo` list in the config file. 1. How does the copy-paste check work? - * If there are some references with the same links and for one of them name is similar to the link itself, other links may turn out to be copypasted from that link. Example: + * If there are some references with the same links and for one of them the link's text is similar to the link itself, other links may turn out to be copypasted from that link. Example: ```md - Links with bad copypaste: + Links with bad copy-paste: [good link](https://good.link.uri/). [copypasted link](https://good.link.uri/). ``` From fd71f9798f2465c6e4962420cf5a8314eee0c919 Mon Sep 17 00:00:00 2001 From: Yuri Romanowski Date: Fri, 23 Dec 2022 19:10:13 +0500 Subject: [PATCH 5/5] fixup! [#64] Implement copy/paste protection checks Fix tests --- src/Xrefcheck/Command.hs | 4 +- tests/golden/check-copy-paste/expected.gold | 63 ++++++++++++++++---- tests/golden/check-copy-paste/second-file.md | 5 +- 3 files changed, 58 insertions(+), 14 deletions(-) diff --git a/src/Xrefcheck/Command.hs b/src/Xrefcheck/Command.hs index 12ee272e..6a13c5b2 100644 --- a/src/Xrefcheck/Command.hs +++ b/src/Xrefcheck/Command.hs @@ -86,7 +86,9 @@ defaultAction Options{..} = do { cNetworking = addNetworkingOptions (cNetworking config) oNetworkingOptions } verifyRepo rw fullConfig oMode oRoot repoInfo - whenJust (nonEmpty copyPasteErrors) reportCopyPasteErrors + whenJust (nonEmpty copyPasteErrors) $ \errs -> do + reportCopyPasteErrors errs + fmt "\n" case verifyErrors verifyRes of Nothing | null scanErrs -> fmtLn "All repository links are valid." Nothing -> exitFailure diff --git a/tests/golden/check-copy-paste/expected.gold b/tests/golden/check-copy-paste/expected.gold index 796b4c73..544e7a94 100644 --- a/tests/golden/check-copy-paste/expected.gold +++ b/tests/golden/check-copy-paste/expected.gold @@ -23,26 +23,67 @@ - anchor: - ➥ In file second-file.md - reference (relative) at src:24:1-29: + reference (relative) at src:23:1-33: + - text: "fubarw" + - link: ./nonexistent-file.md + - anchor: - + is possibly a bad copy paste of + reference (relative) at src:24:1-36: + - text: "nonexfile" + - link: ./nonexistent-file.md + - anchor: - + + ➥ In file second-file.md + reference (relative) at src:25:1-30: - text: "fdw" - - link: ./first-file.md - - anchor: chor + - link: ./nonexistent-file.md + - anchor: - is possibly a bad copy paste of - reference (relative) at src:23:1-32: - - text: "ff-cho" - - link: ./first-file.md - - anchor: chor + reference (relative) at src:24:1-36: + - text: "nonexfile" + - link: ./nonexistent-file.md + - anchor: - ➥ In file second-file.md - reference (external) at src:29:1-28: + reference (external) at src:30:1-28: - text: "gitlab" - link: https://github.com - anchor: - is possibly a bad copy paste of - reference (external) at src:28:1-28: + reference (external) at src:29:1-28: - text: "github" - link: https://github.com - anchor: - -Possible copy/paste errors dumped, 4 in total. -All repository links are valid. +Possible copy/paste errors dumped, 5 in total. + +=== Invalid references found === + + ➥ In file second-file.md + bad reference (relative) at src:23:1-33: + - text: "fubarw" + - link: ./nonexistent-file.md + - anchor: - + + File does not exist: + nonexistent-file.md + + ➥ In file second-file.md + bad reference (relative) at src:24:1-36: + - text: "nonexfile" + - link: ./nonexistent-file.md + - anchor: - + + File does not exist: + nonexistent-file.md + + ➥ In file second-file.md + bad reference (relative) at src:25:1-30: + - text: "fdw" + - link: ./nonexistent-file.md + - anchor: - + + File does not exist: + nonexistent-file.md + +Invalid references dumped, 3 in total. diff --git a/tests/golden/check-copy-paste/second-file.md b/tests/golden/check-copy-paste/second-file.md index 02f4d554..171c55e4 100644 --- a/tests/golden/check-copy-paste/second-file.md +++ b/tests/golden/check-copy-paste/second-file.md @@ -20,8 +20,9 @@ -[ ff-cho](./first-file.md#chor) -[ fdw](./first-file.md#chor) +[ fubarw](./nonexistent-file.md) +[ nonexfile](./nonexistent-file.md) +[ fdw](./nonexistent-file.md)