1 change: 1 addition & 0 deletions .idea/.name


2 changes: 1 addition & 1 deletion .idea/corpora_conversion_tools.iml


18 changes: 18 additions & 0 deletions .idea/inspectionProfiles/Project_Default.xml


4 changes: 1 addition & 3 deletions .idea/misc.xml


143 changes: 143 additions & 0 deletions add_metadata.py
@@ -0,0 +1,143 @@
import re
import os
import json


class AddMetadata:
    def __init__(self, input_directory, output_directory):
        self.input_directory = input_directory.rstrip(os.sep)
        self.output_directory = output_directory.rstrip(os.sep)

    def process_articles(self):
        reviewed_articles_dir = os.path.join(self.input_directory, 'reviewed_articles')

        for root, dirs, files in os.walk(reviewed_articles_dir):
            for filename in files:
                # Only .txt files are processed here
                if filename.endswith('.txt'):
                    base_name = os.path.splitext(filename)[0]
                    if self.is_supplement_file(base_name):
                        continue

                    txt_file_path = os.path.join(root, filename)
                    json_file_path = os.path.join(root, f'{base_name}.json')

                    relative_path = os.path.relpath(root, self.input_directory)
                    output_dir_path = os.path.join(self.output_directory, relative_path)
                    os.makedirs(output_dir_path, exist_ok=True)
                    output_file_path = os.path.join(output_dir_path, filename)

                    self.process_single_file(txt_file_path, json_file_path, output_file_path)

    def is_supplement_file(self, file_name):
        # Assuming all files with an 's' before the digit are supplementary files
        if re.search(r'\.s\d+', file_name, re.IGNORECASE):
            return True
        else:
            return False

    def process_single_file(self, txt_file_path, json_file_path, output_file_path):
        text_content = self.read_text_file(txt_file_path)
        if text_content is None:
            print(f'{txt_file_path} could not be read. Skipping file.')
            return

        metadata = self.read_json_file(json_file_path)
        if metadata is None:
            print(f'{json_file_path} does not contain metadata.')
            attributes_str = ''
        else:
            attributes_str = self.metadata_to_attributes(metadata)

        open_tag = f'<doc {attributes_str}>' if attributes_str else '<doc>'
        doc_tags = f'{open_tag}\n{text_content}\n</doc>'

        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(doc_tags)
        print(f'Metadata added: {output_file_path}')

    def read_text_file(self, file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                print(f'Text file: {file_path} read successfully.')
                return content
        except UnicodeDecodeError:
            print(f'Couldn\'t read {file_path} with utf-8 encoding due to a UnicodeDecodeError.')
            return None
        except FileNotFoundError:
            print(f'Text file not found: {file_path}')
            return None

    def read_json_file(self, file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
                print(f'Read metadata from: {file_path}.')
                return data
        except FileNotFoundError:
            print(f'JSON file not found: {file_path}')
            return None
        except json.JSONDecodeError as e:
            print(f'Error decoding JSON file {file_path}: {e}')
            return None

    def metadata_to_attributes(self, metadata):
        attributes = []
        for key, value in metadata.items():
            attr_string = self.process_metadata_item(key, value)
            if attr_string:
                attributes.append(attr_string)
        return ' '.join(attributes)

    def process_metadata_item(self, key, value):
        if isinstance(value, dict):
            return self.process_dict_attribute(key, value)
        elif isinstance(value, list):
            return self.process_list_attribute(key, value)
        else:
            attr_value = str(value).replace('"', '&quot;')
            return f'{key}="{attr_value}"'

    def process_dict_attribute(self, parent_key, value_dict):
        attributes = []
        for sub_key, sub_value in value_dict.items():
            full_key = f"{parent_key}_{sub_key}"
            attr_value = str(sub_value).replace('"', '&quot;')
            attributes.append(f'{full_key}="{attr_value}"')
        return ' '.join(attributes)

    def process_list_attribute(self, key, value_list):
        if all(isinstance(item, dict) for item in value_list):
            # List of dictionaries
            flattened = self.flatten_dicts_list(value_list)
            attributes = []
            for sub_key, sub_value in flattened.items():
                full_key = f"{key}_{sub_key}"
                attr_value = sub_value.replace('"', '&quot;')
                attributes.append(f'{full_key}="{attr_value}"')
            return ' '.join(attributes)
        else:
            # List of simple values
            value_str = '; '.join(str(item) for item in value_list)
            attr_value = value_str.replace('"', '&quot;')
            return f'{key}="{attr_value}"'

    def flatten_dicts_list(self, dicts_list):
        flattened = {}
        for item in dicts_list:
            for sub_key, sub_value in item.items():
                sub_value_str = str(sub_value)
                if sub_key in flattened:
                    flattened[sub_key].append(sub_value_str)
                else:
                    flattened[sub_key] = [sub_value_str]
        # Join all values separated by ;
        return {key: '; '.join(values) for key, values in flattened.items()}


if __name__ == '__main__':
    input_dir = '/mdpi_review'
    output_dir = '/mdpi_review/metadata_articles'

    processor = AddMetadata(input_dir, output_dir)
    processor.process_articles()
33 changes: 17 additions & 16 deletions functions2txt.py
@@ -7,24 +7,10 @@
import aspose.words as aw
import re
from pdfminer.high_level import extract_text
from bs4 import BeautifulSoup as bs
import xml.etree.ElementTree as ET
from metadata import json_to_sgml


def docx_2txt(file):
    """
    Converts a docx file to a txt file.

    Args:
        file (str): The path to the docx file.

    Returns:
        str: The content of the converted txt file.
    """
    return docx2txt.process(file)


class Converter2vertical:
    def __init__(self, inpath, outpath):
        """
@@ -40,7 +26,7 @@ def __init__(self, inpath, outpath):
        self.inpath = inpath
        self.outpath = outpath
        self.extensions = ['.docx', '.doc', '.xml', '.pdf', '.txt']
        self.extensions_dict = {'.docx': docx2txt,
        self.extensions_dict = {'.docx': self.docx_2txt,
                                '.doc': self.doc2txt,
                                '.xml': self.xml2txt,
                                '.pdf': self.pdf2txt, '.txt': lambda x: x.read()}
@@ -115,7 +101,10 @@ def doc2txt(self, file):
        Returns:
            str: The text content of the converted txt file.
        """
        return aw.Document(file).get_text()
        doc_text = aw.Document(file)
        text = doc_text.get_text().splitlines()
        clean_text = '\n'.join(text[1:-4])
        return clean_text

    def pdf2txt(self, file):
        """
@@ -148,6 +137,18 @@ def xml2txt(self, file):
        root = ET.tostring(tree.getroot(), encoding='utf-8', method='text')
        return root.decode('utf-8')

    def docx_2txt(self, file):
        """
        Converts a docx file to a txt file.

        Args:
            file (str): The path to the docx file.

        Returns:
            str: The content of the converted txt file.
        """
        return docx2txt.process(file)

    def txt2vertical(self, text):
        """
        Converts a text file to a vertical file.
1 change: 1 addition & 0 deletions input_dir/dummy.json
@@ -0,0 +1 @@
{"author": "Test, Name", "title": "Test title", "journal": "Test journal", "year": "2030", "doi": "10.5555/12345678"}
1 change: 1 addition & 0 deletions input_dir/dummy.txt
@@ -0,0 +1 @@
This is a test text.
1 change: 1 addition & 0 deletions input_dir/valid_metadata.json
@@ -0,0 +1 @@
{"author": "Test, Name", "title": "Test title", "journal": "Test journal", "year": "2030", "doi": "10.5555/12345678"}
49 changes: 28 additions & 21 deletions metadata.py
@@ -8,6 +8,7 @@
import json
import xml.etree.ElementTree as ET


def json_to_sgml(json_data, parent):
if isinstance(json_data, dict):
# Convert dictionary items to attributes of the parent element
@@ -27,35 +28,41 @@ def json_to_sgml(json_data, parent):
# Add text content for non-list values
parent.text = str(json_data)


# Check if a JSON filename is provided as a command-line argument
if __name__ == '__main__':
class InvalidArgumentException(Exception):
pass


try:
if len(sys.argv) != 2:
print("Usage: python json_to_sgml.py <input_json_file>")
sys.exit(1)
raise InvalidArgumentException("Usage: python metadata.py <input_json_file>")

# Get the JSON filename from the command line argument
json_filename = sys.argv[1]

try:
# Load JSON data from the provided file
with open(json_filename, 'r') as json_file:
json_data = json.load(json_file)
# Load JSON data from the provided file
with open(json_filename, 'r') as json_file:
json_data = json.load(json_file)

# Create the root "doc" element
root = ET.Element("doc")
# Create the root "doc" element
root = ET.Element("doc")

# Convert JSON to SGML/XML
json_to_sgml(json_data, root)
# Convert JSON to SGML/XML
json_to_sgml(json_data, root)

# Create an ElementTree object
tree = ET.ElementTree(root)
# Create an ElementTree object
tree = ET.ElementTree(root)

# Print or save the XML as needed
xml_str = ET.tostring(root, encoding="unicode")
# Do not close the doc element: we will add sentences after it.
print(xml_str[:-6])
# Print or save the XML as needed
xml_str = ET.tostring(root, encoding="unicode")
# Do not close the doc element: we will add sentences after it.
print(xml_str[:-6])

except FileNotFoundError:
print(f"File not found: {json_filename}")
except json.JSONDecodeError as e:
print(f"Error parsing JSON: {e}")
except InvalidArgumentException as e:
print(e)
sys.exit(1)
except FileNotFoundError:
print(f"File not found: {json_filename}")
except json.JSONDecodeError as e:
print(f"Error parsing JSON: {e}")
59 changes: 59 additions & 0 deletions output_dir/test_doc2txt/brainsci6010001.s12_converted.txt
@@ -0,0 +1,59 @@
 HYPERLINK "input_dir//user/login" Login  HYPERLINK "input_dir//user/register" Register  HYPERLINK "input_dir//user/manuscripts/upload" Submit
 HYPERLINK "https://susy.mdpi.com"  
 HYPERLINK "https://www.mdpi.com/about/journals/" \t "_blank" Journals
 HYPERLINK "https://www.mdpi.com/guidelines" \t "_blank" Information
 HYPERLINK "https://www.mdpi.com/authors" \t "_blank" For Authors
 HYPERLINK "https://www.mdpi.com/reviewers" \t "_blank" For Reviewers
 HYPERLINK "https://www.mdpi.com/editors" \t "_blank" For Editors
 HYPERLINK "https://www.mdpi.com/librarians" \t "_blank" For Librarians
 HYPERLINK "https://www.mdpi.com/publishing_service" \t "_blank" For Publishers
 HYPERLINK "https://www.mdpi.com/societies" \t "_blank" For Societies
 HYPERLINK "https://www.mdpi.com/about/apc" \t "_blank" Article Processing Charges
 HYPERLINK "https://www.mdpi.com/about/openaccess" \t "_blank" Open Access Policy
 HYPERLINK "https://www.mdpi.com/about/memberships" \t "_blank" Institutional Open Access Program
 HYPERLINK "https://www.mdpi.com/editorial_process" \t "_blank" Editorial Process
 HYPERLINK "https://www.mdpi.com/awards" \t "_blank" Awards
 HYPERLINK "https://www.mdpi.com/authors/english" \t "_blank" English Editing Service
 HYPERLINK "javascript:void(0);" Initiatives
 HYPERLINK "https://sciforum.net" \t "_blank" Sciforum
 HYPERLINK "https://www.mdpi.com/books" \t "_blank" MDPI Books
 HYPERLINK "https://www.preprints.org" \t "_blank" Preprints
 HYPERLINK "https://www.scilit.net" \t "_blank" Scilit
 HYPERLINK "https://sciprofiles.com" \t "_blank" SciProfiles
 HYPERLINK "https://encyclopedia.pub" \t "_blank" Encyclopedia
 HYPERLINK "https://jams.pub" \t "_blank" JAMS
 HYPERLINK "https://www.mdpi.com/about/proceedings" \t "_blank" Proceedings Series
 HYPERLINK "https://www.mdpi.com/about/" \t "_blank" About
 HYPERLINK "input_dir//switch/desktop_mobile_layout" \o "Switch to desktop view"  HYPERLINK "input_dir//user/manuscripts/upload" Submit
 HYPERLINK "input_dir//user/login" Login  HYPERLINK "javascript:void(0);" 
Error 404 - File not found
The webpage you are looking could not be found.
The URL may have been incorrectly typed, or the page may
have been moved into another part of the mdpi.com site.
 HYPERLINK "input_dir//user/myprofile" Return to the user page
 HYPERLINK "https://www.mdpi.com/about/contact" Contact
© 1996-2022 MDPI (Basel, Switzerland) unless otherwise stated
 HYPERLINK "javascript:void(0);" \o "The statements, opinions and data contained in the journals are solely those of the individual authors and contributors and not of the publisher and the editor(s)." Disclaimer  HYPERLINK "https://www.mdpi.com/about/terms-and-conditions" \t "_blank" Terms and Conditions  HYPERLINK "https://www.mdpi.com/about/privacy" \t "_blank" Privacy Policy
 HYPERLINK "https://www.mdpi.com" \t "_blank" MDPI
 HYPERLINK "https://www.mdpi.com/about/journals" \t "_blank" Journals
 HYPERLINK "https://www.mdpi.com/guidelines" Information
 HYPERLINK "https://www.mdpi.com/authors" \t "_blank" For Authors
 HYPERLINK "https://www.mdpi.com/reviewers" \t "_blank" For Reviewers
 HYPERLINK "https://www.mdpi.com/editors" \t "_blank" For Editors
 HYPERLINK "https://www.mdpi.com/librarians" \t "_blank" For Librarians
 HYPERLINK "https://www.mdpi.com/publishing_service" \t "_blank" For Publishers
 HYPERLINK "https://www.mdpi.com/societies" \t "_blank" For Societies
 HYPERLINK "https://www.mdpi.com/about/apc" \t "_blank" Article Processing Charges
 HYPERLINK "https://www.mdpi.com/about/openaccess" \t "_blank" Open Access Policy
 HYPERLINK "https://www.mdpi.com/about/memberships" \t "_blank" Institutional Open Access Program
 HYPERLINK "https://www.mdpi.com/editorial_process" \t "_blank" Editorial Process
 HYPERLINK "https://www.mdpi.com/awards" \t "_blank" Awards
 HYPERLINK "https://www.mdpi.com/authors/english" \t "_blank" English Editing Service
 HYPERLINK "javascript:void(0);" Initiatives
 HYPERLINK "https://sciforum.net" \t "_blank" Sciforum
 HYPERLINK "https://www.mdpi.com/books" \t "_blank" MDPI Books
 HYPERLINK "https://www.preprints.org" \t "_blank" Preprints
 HYPERLINK "https://www.scilit.net" \t "_blank" Scilit
 HYPERLINK "https://sciprofiles.com" \t "_blank" SciProfiles
 HYPERLINK "https://encyclopedia.pub" \t "_blank" Encyclopedia
 HYPERLINK "https://jams.pub" \t "_blank" JAMS