1 change: 1 addition & 0 deletions .idea/.name


2 changes: 1 addition & 1 deletion .idea/corpora_conversion_tools.iml


18 changes: 18 additions & 0 deletions .idea/inspectionProfiles/Project_Default.xml


4 changes: 1 addition & 3 deletions .idea/misc.xml


143 changes: 143 additions & 0 deletions add_metadata.py
@@ -0,0 +1,143 @@
import re
import os
import json


class AddMetadata:
    def __init__(self, input_directory, output_directory):
        self.input_directory = input_directory.rstrip(os.sep)
        self.output_directory = output_directory.rstrip(os.sep)

    def process_articles(self):
        reviewed_articles_dir = os.path.join(self.input_directory, 'reviewed_articles')

        for root, dirs, files in os.walk(reviewed_articles_dir):
            for filename in files:
                # Only .txt files are processed here
                if filename.endswith('.txt'):
                    base_name = os.path.splitext(filename)[0]
                    if self.is_supplement_file(base_name):
                        continue

                    txt_file_path = os.path.join(root, filename)
                    json_file_path = os.path.join(root, f'{base_name}.json')

                    relative_path = os.path.relpath(root, self.input_directory)
                    output_dir_path = os.path.join(self.output_directory, relative_path)
                    os.makedirs(output_dir_path, exist_ok=True)
                    output_file_path = os.path.join(output_dir_path, filename)

                    self.process_single_file(txt_file_path, json_file_path, output_file_path)

    def is_supplement_file(self, file_name):
        # Assuming all files with an 's' before the digit are supplementary files
        if re.search(r'\.s\d+', file_name, re.IGNORECASE):
            return True
        else:
            return False

    def process_single_file(self, txt_file_path, json_file_path, output_file_path):
        text_content = self.read_text_file(txt_file_path)
        if text_content is None:
            print(f'{txt_file_path} could not be read. Skipping file.')
            return

        metadata = self.read_json_file(json_file_path)
        if metadata is None:
            print(f'{json_file_path} does not contain metadata.')
            attributes_str = ''
        else:
            attributes_str = self.metadata_to_attributes(metadata)

        open_tag = f'<doc {attributes_str}>' if attributes_str else '<doc>'
        doc_tags = f'{open_tag}\n{text_content}\n</doc>'

        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(doc_tags)
        print(f'Metadata added: {output_file_path}')

    def read_text_file(self, file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                print(f'Text file: {file_path} read successfully.')
                return content
        except UnicodeDecodeError:
            print(f'Couldn\'t read {file_path} with utf-8 encoding due to a UnicodeDecodeError.')
            return None
        except FileNotFoundError:
            print(f'Text file not found: {file_path}')
            return None

    def read_json_file(self, file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
                print(f'Read metadata from: {file_path}.')
                return data
        except FileNotFoundError:
            print(f'JSON file not found: {file_path}')
            return None
        except json.JSONDecodeError as e:
            print(f'Error decoding JSON file {file_path}: {e}')
            return None

    def metadata_to_attributes(self, metadata):
        attributes = []
        for key, value in metadata.items():
            attr_string = self.process_metadata_item(key, value)
            if attr_string:
                attributes.append(attr_string)
        return ' '.join(attributes)

    def process_metadata_item(self, key, value):
        if isinstance(value, dict):
            return self.process_dict_attribute(key, value)
        elif isinstance(value, list):
            return self.process_list_attribute(key, value)
        else:
            attr_value = str(value).replace('"', '&quot;')
            return f'{key}="{attr_value}"'

    def process_dict_attribute(self, parent_key, value_dict):
        attributes = []
        for sub_key, sub_value in value_dict.items():
            full_key = f"{parent_key}_{sub_key}"
            attr_value = str(sub_value).replace('"', '&quot;')
            attributes.append(f'{full_key}="{attr_value}"')
        return ' '.join(attributes)

    def process_list_attribute(self, key, value_list):
        if all(isinstance(item, dict) for item in value_list):
            # List of dictionaries
            flattened = self.flatten_dicts_list(value_list)
            attributes = []
            for sub_key, sub_value in flattened.items():
                full_key = f"{key}_{sub_key}"
                attr_value = sub_value.replace('"', '&quot;')
                attributes.append(f'{full_key}="{attr_value}"')
            return ' '.join(attributes)
        else:
            # List of simple values
            value_str = '; '.join(str(item) for item in value_list)
            attr_value = value_str.replace('"', '&quot;')
            return f'{key}="{attr_value}"'

    def flatten_dicts_list(self, dicts_list):
        flattened = {}
        for item in dicts_list:
            for sub_key, sub_value in item.items():
                sub_value_str = str(sub_value)
                if sub_key in flattened:
                    flattened[sub_key].append(sub_value_str)
                else:
                    flattened[sub_key] = [sub_value_str]
        # Join all values separated by ;
        return {key: '; '.join(values) for key, values in flattened.items()}


if __name__ == '__main__':
    input_dir = '/mdpi_review'
    output_dir = '/mdpi_review/metadata_articles'

    processor = AddMetadata(input_dir, output_dir)
    processor.process_articles()
33 changes: 17 additions & 16 deletions functions2txt.py
@@ -7,24 +7,10 @@
import aspose.words as aw
import re
from pdfminer.high_level import extract_text
from bs4 import BeautifulSoup as bs
import xml.etree.ElementTree as ET
from metadata import json_to_sgml


def docx_2txt(file):
    """
    Converts a docx file to a txt file.

    Args:
        file (str): The path to the docx file.

    Returns:
        str: The content of the converted txt file.
    """
    return docx2txt.process(file)


class Converter2vertical:
    def __init__(self, inpath, outpath):
        """
@@ -40,7 +26,7 @@ def __init__(self, inpath, outpath):
        self.inpath = inpath
        self.outpath = outpath
        self.extensions = ['.docx', '.doc', '.xml', '.pdf', '.txt']
        self.extensions_dict = {'.docx': docx2txt,
        self.extensions_dict = {'.docx': self.docx_2txt,
                                '.doc': self.doc2txt,
                                '.xml': self.xml2txt,
                                '.pdf': self.pdf2txt, '.txt': lambda x: x.read()}
@@ -115,7 +101,10 @@ def doc2txt(self, file):
        Returns:
            str: The text content of the converted txt file.
        """
        return aw.Document(file).get_text()
        doc_text = aw.Document(file)
        text = doc_text.get_text().splitlines()
        clean_text = '\n'.join(text[1:-4])
        return clean_text

    def pdf2txt(self, file):
        """
@@ -148,6 +137,18 @@ def xml2txt(self, file):
        root = ET.tostring(tree.getroot(), encoding='utf-8', method='text')
        return root.decode('utf-8')

    def docx_2txt(self, file):
        """
        Converts a docx file to a txt file.

        Args:
            file (str): The path to the docx file.

        Returns:
            str: The content of the converted txt file.
        """
        return docx2txt.process(file)

    def txt2vertical(self, text):
        """
        Converts a text file to a vertical file.
1 change: 1 addition & 0 deletions input_dir/dummy.json
@@ -0,0 +1 @@
{"author": "Test, Name", "title": "Test title", "journal": "Test journal", "year": "2030", "doi": "10.5555/12345678"}
1 change: 1 addition & 0 deletions input_dir/dummy.txt
@@ -0,0 +1 @@
This is a test text.
1 change: 1 addition & 0 deletions input_dir/valid_metadata.json
@@ -0,0 +1 @@
{"author": "Test, Name", "title": "Test title", "journal": "Test journal", "year": "2030", "doi": "10.5555/12345678"}
49 changes: 28 additions & 21 deletions metadata.py
@@ -8,6 +8,7 @@
import json
import xml.etree.ElementTree as ET


def json_to_sgml(json_data, parent):
if isinstance(json_data, dict):
# Convert dictionary items to attributes of the parent element
@@ -27,35 +28,41 @@ def json_to_sgml(json_data, parent):
# Add text content for non-list values
parent.text = str(json_data)


# Check if a JSON filename is provided as a command-line argument
if __name__ == '__main__':
class InvalidArgumentException(Exception):
pass


try:
if len(sys.argv) != 2:
print("Usage: python json_to_sgml.py <input_json_file>")
sys.exit(1)
raise InvalidArgumentException("Usage: python metadata.py <input_json_file>")

# Get the JSON filename from the command line argument
json_filename = sys.argv[1]

try:
# Load JSON data from the provided file
with open(json_filename, 'r') as json_file:
json_data = json.load(json_file)
# Load JSON data from the provided file
with open(json_filename, 'r') as json_file:
json_data = json.load(json_file)

# Create the root "doc" element
root = ET.Element("doc")
# Create the root "doc" element
root = ET.Element("doc")

# Convert JSON to SGML/XML
json_to_sgml(json_data, root)
# Convert JSON to SGML/XML
json_to_sgml(json_data, root)

# Create an ElementTree object
tree = ET.ElementTree(root)
# Create an ElementTree object
tree = ET.ElementTree(root)

# Print or save the XML as needed
xml_str = ET.tostring(root, encoding="unicode")
# Do not close the doc element: we will add sentences after it.
print(xml_str[:-6])
# Print or save the XML as needed
xml_str = ET.tostring(root, encoding="unicode")
# Do not close the doc element: we will add sentences after it.
print(xml_str[:-6])

except FileNotFoundError:
print(f"File not found: {json_filename}")
except json.JSONDecodeError as e:
print(f"Error parsing JSON: {e}")
except InvalidArgumentException as e:
print(e)
sys.exit(1)
except FileNotFoundError:
print(f"File not found: {json_filename}")
except json.JSONDecodeError as e:
print(f"Error parsing JSON: {e}")
59 changes: 59 additions & 0 deletions output_dir/test_doc2txt/brainsci6010001.s12_converted.txt
@@ -0,0 +1,59 @@
 HYPERLINK "input_dir//user/login" Login  HYPERLINK "input_dir//user/register" Register  HYPERLINK "input_dir//user/manuscripts/upload" Submit
 HYPERLINK "https://susy.mdpi.com"  
 HYPERLINK "https://www.mdpi.com/about/journals/" \t "_blank" Journals
 HYPERLINK "https://www.mdpi.com/guidelines" \t "_blank" Information
 HYPERLINK "https://www.mdpi.com/authors" \t "_blank" For Authors
 HYPERLINK "https://www.mdpi.com/reviewers" \t "_blank" For Reviewers
 HYPERLINK "https://www.mdpi.com/editors" \t "_blank" For Editors
 HYPERLINK "https://www.mdpi.com/librarians" \t "_blank" For Librarians
 HYPERLINK "https://www.mdpi.com/publishing_service" \t "_blank" For Publishers
 HYPERLINK "https://www.mdpi.com/societies" \t "_blank" For Societies
 HYPERLINK "https://www.mdpi.com/about/apc" \t "_blank" Article Processing Charges
 HYPERLINK "https://www.mdpi.com/about/openaccess" \t "_blank" Open Access Policy
 HYPERLINK "https://www.mdpi.com/about/memberships" \t "_blank" Institutional Open Access Program
 HYPERLINK "https://www.mdpi.com/editorial_process" \t "_blank" Editorial Process
 HYPERLINK "https://www.mdpi.com/awards" \t "_blank" Awards
 HYPERLINK "https://www.mdpi.com/authors/english" \t "_blank" English Editing Service
 HYPERLINK "javascript:void(0);" Initiatives
 HYPERLINK "https://sciforum.net" \t "_blank" Sciforum
 HYPERLINK "https://www.mdpi.com/books" \t "_blank" MDPI Books
 HYPERLINK "https://www.preprints.org" \t "_blank" Preprints
 HYPERLINK "https://www.scilit.net" \t "_blank" Scilit
 HYPERLINK "https://sciprofiles.com" \t "_blank" SciProfiles
 HYPERLINK "https://encyclopedia.pub" \t "_blank" Encyclopedia
 HYPERLINK "https://jams.pub" \t "_blank" JAMS
 HYPERLINK "https://www.mdpi.com/about/proceedings" \t "_blank" Proceedings Series
 HYPERLINK "https://www.mdpi.com/about/" \t "_blank" About
 HYPERLINK "input_dir//switch/desktop_mobile_layout" \o "Switch to desktop view"  HYPERLINK "input_dir//user/manuscripts/upload" Submit
 HYPERLINK "input_dir//user/login" Login  HYPERLINK "javascript:void(0);" 
Error 404 - File not found
The webpage you are looking could not be found.
The URL may have been incorrectly typed, or the page may
have been moved into another part of the mdpi.com site.
 HYPERLINK "input_dir//user/myprofile" Return to the user page
 HYPERLINK "https://www.mdpi.com/about/contact" Contact
© 1996-2022 MDPI (Basel, Switzerland) unless otherwise stated
 HYPERLINK "javascript:void(0);" \o "The statements, opinions and data contained in the journals are solely those of the individual authors and contributors and not of the publisher and the editor(s)." Disclaimer  HYPERLINK "https://www.mdpi.com/about/terms-and-conditions" \t "_blank" Terms and Conditions  HYPERLINK "https://www.mdpi.com/about/privacy" \t "_blank" Privacy Policy
 HYPERLINK "https://www.mdpi.com" \t "_blank" MDPI
 HYPERLINK "https://www.mdpi.com/about/journals" \t "_blank" Journals
 HYPERLINK "https://www.mdpi.com/guidelines" Information
 HYPERLINK "https://www.mdpi.com/authors" \t "_blank" For Authors
 HYPERLINK "https://www.mdpi.com/reviewers" \t "_blank" For Reviewers
 HYPERLINK "https://www.mdpi.com/editors" \t "_blank" For Editors
 HYPERLINK "https://www.mdpi.com/librarians" \t "_blank" For Librarians
 HYPERLINK "https://www.mdpi.com/publishing_service" \t "_blank" For Publishers
 HYPERLINK "https://www.mdpi.com/societies" \t "_blank" For Societies
 HYPERLINK "https://www.mdpi.com/about/apc" \t "_blank" Article Processing Charges
 HYPERLINK "https://www.mdpi.com/about/openaccess" \t "_blank" Open Access Policy
 HYPERLINK "https://www.mdpi.com/about/memberships" \t "_blank" Institutional Open Access Program
 HYPERLINK "https://www.mdpi.com/editorial_process" \t "_blank" Editorial Process
 HYPERLINK "https://www.mdpi.com/awards" \t "_blank" Awards
 HYPERLINK "https://www.mdpi.com/authors/english" \t "_blank" English Editing Service
 HYPERLINK "javascript:void(0);" Initiatives
 HYPERLINK "https://sciforum.net" \t "_blank" Sciforum
 HYPERLINK "https://www.mdpi.com/books" \t "_blank" MDPI Books
 HYPERLINK "https://www.preprints.org" \t "_blank" Preprints
 HYPERLINK "https://www.scilit.net" \t "_blank" Scilit
 HYPERLINK "https://sciprofiles.com" \t "_blank" SciProfiles
 HYPERLINK "https://encyclopedia.pub" \t "_blank" Encyclopedia
 HYPERLINK "https://jams.pub" \t "_blank" JAMS