From 0f313a9dbd59c8a2d1d25e61f10e2ffbf5989290 Mon Sep 17 00:00:00 2001 From: Naymul Islam Date: Sun, 3 Sep 2023 12:17:13 +0600 Subject: [PATCH 1/2] update the constant name and imports Constant names are converted to all uppercase, the commented-out fnmatch import is removed, and the urllib.parse and urllib.error imports are dropped from the urllib import line --- .idea/.gitignore | 3 +++ .idea/WikiExtractor.iml | 12 +++++++++ .../inspectionProfiles/profiles_settings.xml | 6 +++++ .idea/misc.xml | 4 +++ .idea/modules.xml | 8 ++++++ .idea/vcs.xml | 6 +++++ WikiExtractor.py | 25 +++++++++---------- 7 files changed, 51 insertions(+), 13 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/WikiExtractor.iml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/WikiExtractor.iml b/.idea/WikiExtractor.iml new file mode 100644 index 0000000..8b8c395 --- /dev/null +++ b/.idea/WikiExtractor.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..a971a2c --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..2034c7c --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index
0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/WikiExtractor.py b/WikiExtractor.py index 03f92fa..18dac67 100755 --- a/WikiExtractor.py +++ b/WikiExtractor.py @@ -52,12 +52,11 @@ import argparse import gc import sys -import urllib.request, urllib.parse, urllib.error +import urllib.request import re import bz2 import os.path from html.entities import name2codepoint -#import fnmatch import shutil import mimetypes import gzip @@ -70,7 +69,7 @@ ### PARAMS #################################################################### # This is obtained from the dump itself -prefix = None +PREFIX = None ## # Whether to preseve links in output @@ -86,12 +85,12 @@ # Recognize only these namespaces # w: Internal links to the Wikipedia # -acceptedNamespaces = set(['w']) +ACCEPTED_NAMESPACES= set(['w']) ## # Drop these elements from article text # -discardElements = set([ +DISCARD_ELEMENTS = set([ 'gallery', 'timeline', 'noinclude', 'pre', 'table', 'tr', 'td', 'th', 'caption', 'form', 'input', 'select', 'option', 'textarea', @@ -132,7 +131,7 @@ ## print(footer, file=out) def WikiDocumentSentences(out, id, title, tags, text): - url = get_url(id, prefix) + url = get_url(id, PREFIX) header = '\n{0}:{1}'.format(title, "|||".join(tags)) # Separate header from text with a newline. 
text = clean(text) @@ -176,7 +175,7 @@ def normalizeTitle(title): rest = m.group(3) ns = prefix.capitalize() - if ns in acceptedNamespaces: + if ns in ACCEPTED_NAMESPACES: # If the prefix designates a known namespace, then it might be # followed by optional whitespace that should be removed to get # the canonical page name @@ -224,7 +223,7 @@ def fixup(m): # Match elements to ignore discard_element_patterns = [] -for tag in discardElements: +for tag in DISCARD_ELEMENTS: pattern = re.compile(r'<\s*%s\b[^>]*>.*?<\s*/\s*%s>' % (tag, tag), re.DOTALL | re.IGNORECASE) discard_element_patterns.append(pattern) @@ -353,7 +352,7 @@ def make_anchor_tag(match): global keepLinks link = match.group(1) colon = link.find(':') - if colon > 0 and link[:colon] not in acceptedNamespaces: + if colon > 0 and link[:colon] not in ACCEPTED_NAMESPACES: return '' trail = match.group(3) anchor = match.group(2) @@ -587,7 +586,7 @@ def file_name(self): def process_data(ftype, input, output_sentences, output_structure, incubator, vital_titles=None, vital_tags=None): - global prefix + global PREFIX page = [] id = None inText = False @@ -625,7 +624,7 @@ def process_data(ftype, input, output_sentences, output_structure, incubator, page.append(line) elif tag == '/page': colon = title.find(':') - if (colon < 0 or title[:colon] in acceptedNamespaces) and \ + if (colon < 0 or title[:colon] in ACCEPTED_NAMESPACES) and \ not redirect: if (not vital_titles) or (title in vital_titles): if((incubator != '') and (lang[1] == incubator) and len(lang) > 2): @@ -648,7 +647,7 @@ def process_data(ftype, input, output_sentences, output_structure, incubator, # discover prefix from the xml dump file # /mediawiki/siteinfo/base base = m.group(3) - prefix = base[:base.rfind("/")] + PREFIX = base[:base.rfind("/")] ##def load_vital_titles(vitalfn): ## """Given the filename for the vital titles list (one title per line, with @@ -698,7 +697,7 @@ def get_argparser(): return parser def main(): - global keepLinks, 
keepSections, prefix, acceptedNamespaces + global keepLinks, keepSections, PREFIX, ACCEPTED_NAMESPACES script_name = os.path.basename(sys.argv[0]) parser = get_argparser() From 59b0bbb2784f1636f2476a9314817d286c4c1571 Mon Sep 17 00:00:00 2001 From: Naymul Islam Date: Thu, 7 Sep 2023 17:09:23 +0600 Subject: [PATCH 2/2] Removed folder --- .idea/.gitignore | 3 --- .idea/WikiExtractor.iml | 12 ------------ .idea/inspectionProfiles/profiles_settings.xml | 6 ------ .idea/misc.xml | 4 ---- .idea/modules.xml | 8 -------- .idea/vcs.xml | 6 ------ 6 files changed, 39 deletions(-) delete mode 100644 .idea/.gitignore delete mode 100644 .idea/WikiExtractor.iml delete mode 100644 .idea/inspectionProfiles/profiles_settings.xml delete mode 100644 .idea/misc.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 26d3352..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml diff --git a/.idea/WikiExtractor.iml b/.idea/WikiExtractor.iml deleted file mode 100644 index 8b8c395..0000000 --- a/.idea/WikiExtractor.iml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index a971a2c..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 2034c7c..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 35eb1dd..0000000 --- 
a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file