From 6d8074ddda70043fe9696ecf76d93981c3f445f0 Mon Sep 17 00:00:00 2001 From: Benjam <53127823+benjamsf@users.noreply.github.com> Date: Sun, 21 May 2023 17:20:57 +0300 Subject: [PATCH 1/6] Add export_docs functionality which saves separate files from Lutherscripts word tokenized corpus json --- lutherscripts/cli.py | 12 ++++++++---- ...python_createseparatefiles_from_metadata.py | 18 ++++++++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 lutherscripts/src/text_preparation/python_createseparatefiles_from_metadata.py diff --git a/lutherscripts/cli.py b/lutherscripts/cli.py index addde21..18e4c6e 100644 --- a/lutherscripts/cli.py +++ b/lutherscripts/cli.py @@ -13,7 +13,7 @@ def add_arguments(parser): - parser.add_argument("-o", "--operation", type=str, choices=["word_tokenize_latin", "sent_tokenize_latin", "kwic_analysis", "freq_analysis", "build_corpus", "topic_modeling", "word_document_probability"], required=True, help="Choose operation: word_tokenize_latin, sent_tokenize_latin, kwic_analysis, corpus_builder, topic_modeler or word_document_probability") + parser.add_argument("-o", "--operation", type=str, choices=["word_tokenize_latin", "sent_tokenize_latin", "kwic_analysis", "freq_analysis", "build_corpus", "topic_modeling", "word_document_probability", "export_docs"], required=True, help="Choose operation: word_tokenize_latin, sent_tokenize_latin, kwic_analysis, corpus_builder, topic_modeler, export_docs or word_document_probability") parser.add_argument("-1", "--first-detail", type=float, help="First detail flag for operation, depends on the operation") parser.add_argument("-2", "--second-detail", type=float, help="Second detail flag for operation, depends on the operation") parser.add_argument("-3", "--third-detail", type=int, help="Third detail flag for operation, depends on the operation") @@ -52,6 +52,10 @@ def word_document_probability(source_path, corpus_path, dictionary_path, destina from src.text_processing.gensim_word_document_probability import main as gensim_word_document_probability output = gensim_word_document_probability(source_path, corpus_path, dictionary_path, destination_path) +def export_docs(source_path, destination_path) + from src.text_preparation.python_createseparatefiles_from_metadata import main as export_docs + output = export_docs(source_path, destination_path) + def cli_main(): parser = argparse.ArgumentParser(description="Lutherscript operations launcher") add_arguments(parser) @@ -91,9 +95,9 @@ def cli_main(): freq_analysis(source_path, destination_path) elif args.operation == 'build_corpus': build_corpus(source_path, destination_path, args.first_detail, args.second_detail) + elif args.operation == 'export_docs': + export_docs(source_path, destination_path) if __name__ == '__main__': - cli_main() - - + cli_main() \ No newline at end of file diff --git a/lutherscripts/src/text_preparation/python_createseparatefiles_from_metadata.py b/lutherscripts/src/text_preparation/python_createseparatefiles_from_metadata.py new file mode 100644 index 0000000..54e494c --- /dev/null +++ b/lutherscripts/src/text_preparation/python_createseparatefiles_from_metadata.py @@ -0,0 +1,18 @@ +import json + +# A little script to save documents from the json corpus as +# separate files, for to work with certain 3rd party tools like Voyant + + +def save_documents_to_files(source_path, destination_path): + # Load the corpus from json + with open(source_path, 'r') as f: + documents = json.load(f) + + for i, document in enumerate(documents): + # Replace 
"metadata" with "title" + document['title'] = document.pop('metadata') + + # Save each document as independent file + with open(os.path.join(destination_path, f'document_{i}.json'), 'w') as f: + json.dump(document, f) \ No newline at end of file From 1d1b21aa96a216b498bc05c9d07d12d5ac02ee4c Mon Sep 17 00:00:00 2001 From: regularjoe <53127823+benjamsf@users.noreply.github.com> Date: Sun, 10 Mar 2024 16:33:09 +0200 Subject: [PATCH 2/6] lutherscripts JSON array to separate txt files To integrate well with VoyantTools --- lutherscripts/cli.py | 2 +- lutherscripts/gui.py | 6 ++-- lutherscripts/src/data/extrastopwords.py | 4 +++ ...ython_createseparatefiles_from_metadata.py | 34 ++++++++++++------- 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/lutherscripts/cli.py b/lutherscripts/cli.py index 18e4c6e..cb941ec 100644 --- a/lutherscripts/cli.py +++ b/lutherscripts/cli.py @@ -52,7 +52,7 @@ def word_document_probability(source_path, corpus_path, dictionary_path, destina from src.text_processing.gensim_word_document_probability import main as gensim_word_document_probability output = gensim_word_document_probability(source_path, corpus_path, dictionary_path, destination_path) -def export_docs(source_path, destination_path) +def export_docs(source_path, destination_path): from src.text_preparation.python_createseparatefiles_from_metadata import main as export_docs output = export_docs(source_path, destination_path) diff --git a/lutherscripts/gui.py b/lutherscripts/gui.py index e439f7d..e40b0c9 100644 --- a/lutherscripts/gui.py +++ b/lutherscripts/gui.py @@ -121,7 +121,8 @@ def gui_main(): ("kwic_analysis", "Perform KWIC analysis from your JSON word tokenized text"), ("freq_analysis", "Perform word frequency analysis from your JSON word tokenized text"), ("build_corpus", "Build a dictionary and corpus from your JSON word tokenized text"), - ("topic_modeling", "Perform Topic Modeling from your dictionary and corpus") + ("topic_modeling", "Perform Topic Modeling from your dictionary and corpus"), + ("export_docs", "Export the tokenized JSON to multiple txt documents") ] def update_explanation(*args): @@ -131,7 +132,8 @@ def update_explanation(*args): "Perform KWIC analysis from your JSON word tokenized text": "This operation will perform a Key Word in Context (KWIC) analysis, allowing you to see the occurrences of a word within the context of the text, using NLTK. Source must be a Word Tokenized text in JSON format.", "Perform word frequency analysis": "This operation will perform a Word Frequency Analysis, allowing you to see the number of times each word has been used in your target text, using NLTK. Source must be a Word Tokenized text in JSON format.", "Build a dictionary and corpus from your JSON word tokenized text": "This operation will build a dictionary and a corpus from your Word Tokenized text in JSON format using GenSim, for to source further operations. As Arg 1 pass minimum appearance of a word in a document corpus to be accepted to the corpus, as Arg 2 pass the maximum in a fraction of a document to do the same.", - "Perform Topic Modeling from your dictionary and corpus": "This operation will perform Topic Modeling using GenSim from your dictionary and corpus files. As Argument 1, pass the number of topics you want to try dig out from the text. As Argument 2, pass the number of passes to perform on the corpus. Test different values both here and during the corpus building for to achieve accuracy." 
+ "Perform Topic Modeling from your dictionary and corpus": "This operation will perform Topic Modeling using GenSim from your dictionary and corpus files. As Argument 1, pass the number of topics you want to try dig out from the text. As Argument 2, pass the number of passes to perform on the corpus. Test different values both here and during the corpus building for to achieve accuracy.", + "Export docs": "Lutherscripts Latin tokenizer will output the source to a JSON array. Export that to separate txt documents for work with tools like Voyant." } selected_operation = var_operation.get() diff --git a/lutherscripts/src/data/extrastopwords.py b/lutherscripts/src/data/extrastopwords.py index eceb0da..54ca81a 100644 --- a/lutherscripts/src/data/extrastopwords.py +++ b/lutherscripts/src/data/extrastopwords.py @@ -129,6 +129,7 @@ 'este', 'esto', 'et', + 'etc', 'etenim', 'etiam', 'etiamsi', @@ -323,6 +324,7 @@ 'non', 'nondum', 'nonne', + 'nolo', 'nos', 'noster', 'nostra', @@ -360,6 +362,7 @@ 'p', 'paene', 'paro1', + 'pars', 'pauca', 'paulus', 'paulus1', @@ -438,6 +441,7 @@ 'quis', 'quisque', 'quo', + '-que', 'quoad', 'quod', 'quodsi', diff --git a/lutherscripts/src/text_preparation/python_createseparatefiles_from_metadata.py b/lutherscripts/src/text_preparation/python_createseparatefiles_from_metadata.py index 54e494c..e2cfd3b 100644 --- a/lutherscripts/src/text_preparation/python_createseparatefiles_from_metadata.py +++ b/lutherscripts/src/text_preparation/python_createseparatefiles_from_metadata.py @@ -1,18 +1,28 @@ import json +import os +from tqdm import tqdm # Import the tqdm function for the progress bar -# A little script to save documents from the json corpus as -# separate files, for to work with certain 3rd party tools like Voyant - - -def save_documents_to_files(source_path, destination_path): - # Load the corpus from json +def main(source_path, destination_path): + # Ensure the destination directory exists + os.makedirs(destination_path, exist_ok=True) + + # Load the corpus from JSON with open(source_path, 'r') as f: documents = json.load(f) - for i, document in enumerate(documents): - # Replace "metadata" with "title" - document['title'] = document.pop('metadata') + # Process each document with a progress bar + for i, document in tqdm(enumerate(documents), total=len(documents), desc="Processing documents"): + # Extract the title ("metadata") and the document body ("tokens") + title = document.get('metadata', f'Document {i}') # Use a default title if missing + body = ', '.join(document.get('tokens', [])) # Convert tokens list to string, with commas + + # Combine the title and the body with two newlines in between + content = f"{title}\n\n{body}" + + # Save each document as a separate .txt file + filename = os.path.join(destination_path, f'document_{i}.txt') + with open(filename, 'w', encoding='utf-8') as f: + f.write(content) + + print(f'The JSON has been exported to separate txt documents at {destination_path}.') - # Save each document as independent file - with open(os.path.join(destination_path, f'document_{i}.json'), 'w') as f: - json.dump(document, f) \ No newline at end of file From b830b8fb99be5dc4b488e9b5906caf15162caf50 Mon Sep 17 00:00:00 2001 From: regularjoe <53127823+benjamsf@users.noreply.github.com> Date: Sun, 10 Mar 2024 16:45:06 +0200 Subject: [PATCH 3/6] fixit --- ...ython_createseparatefiles_from_metadata.py | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git 
a/lutherscripts/src/text_preparation/python_createseparatefiles_from_metadata.py b/lutherscripts/src/text_preparation/python_createseparatefiles_from_metadata.py index e2cfd3b..9a232f4 100644 --- a/lutherscripts/src/text_preparation/python_createseparatefiles_from_metadata.py +++ b/lutherscripts/src/text_preparation/python_createseparatefiles_from_metadata.py @@ -2,6 +2,13 @@ import os from tqdm import tqdm # Import the tqdm function for the progress bar +def sanitize_filename(title): + """Remove disallowed characters and shorten the filename if necessary.""" + # Replace any characters not allowed in file names with underscores + safe_title = "".join(c if c.isalnum() or c in " -_" else "_" for c in title) + # Shorten the title if it's too long for a filename + return safe_title[:250] # Filesystem limit, can be adjusted + def main(source_path, destination_path): # Ensure the destination directory exists os.makedirs(destination_path, exist_ok=True) @@ -12,17 +19,20 @@ def main(source_path, destination_path): # Process each document with a progress bar for i, document in tqdm(enumerate(documents), total=len(documents), desc="Processing documents"): - # Extract the title ("metadata") and the document body ("tokens") - title = document.get('metadata', f'Document {i}') # Use a default title if missing - body = ', '.join(document.get('tokens', [])) # Convert tokens list to string, with commas - - # Combine the title and the body with two newlines in between - content = f"{title}\n\n{body}" + # Extract the title ("metadata") and use it as the filename + title = document.get('metadata', f'Document_{i}') # Use a default title if missing + # Convert tokens list to string, with commas + body = ', '.join(document.get('tokens', [])) + + # Make sure the title is safe to use as a filename + filename_title = sanitize_filename(title) - # Save each document as a separate .txt file - filename = os.path.join(destination_path, f'document_{i}.txt') + # Save each document as a separate .txt file, named after the title + filename = os.path.join(destination_path, f'{filename_title}.txt') with open(filename, 'w', encoding='utf-8') as f: - f.write(content) + f.write(body) # Write only the body without the title inside the file print(f'The JSON has been exported to separate txt documents at {destination_path}.') +# Example usage +# main('source.json', 'destination_folder') From ba96a44ae5c775d92a3486f11d5854736247c93f Mon Sep 17 00:00:00 2001 From: regularjoe <53127823+benjamsf@users.noreply.github.com> Date: Tue, 12 Mar 2024 22:28:16 +0200 Subject: [PATCH 4/6] updates --- lutherscripts/gui.py | 205 +++++++++++------- .../cltk_sentencetokenize_latin_arg.py | 77 +++++-- .../cltk_wordtokenize_latin_arg.py | 59 ++--- .../text_processing/nltk_do_freqanalysis.py | 27 +-- testrun1_wordtokenizer2.bat | 2 +- testrun2_corpusbuilder.bat | 2 +- testrun3_topicmodeler.bat | 2 +- 7 files changed, 222 insertions(+), 152 deletions(-) diff --git a/lutherscripts/gui.py b/lutherscripts/gui.py index e40b0c9..9547f05 100644 --- a/lutherscripts/gui.py +++ b/lutherscripts/gui.py @@ -12,41 +12,47 @@ from concurrent.futures import ThreadPoolExecutor import asyncio import time +import queue + __author__ = "benjamsf" __license__ = "MIT" -stop_flag = [False] +# This flag and queue are used for communication between threads and updating the GUI +stop_flag = False +message_queue = queue.Queue() class CustomTextRedirector: - def __init__(self, widget): + def __init__(self, widget, single_line_mode=False): self.widget = widget - 
self.widget.configure(background='black', foreground='green') # Set background and text color - self.encoding = 'utf-8' # Set the encoding for the widget + self.single_line_mode = single_line_mode + self.widget.configure(background='black', foreground='green', font=('Arial')) def write(self, message): - self.widget.configure(state='normal') - self.widget.insert(tk.END, message.encode(self.encoding)) # Encode the message with the specified encoding - self.widget.see(tk.END) - self.widget.configure(state='disabled') - self.widget.update() + if self.widget.winfo_exists(): + self.widget.configure(state='normal') + if self.single_line_mode: + # Replace last line + current_content = self.widget.get("1.0", tk.END).splitlines() + current_content[-1] = message.strip() # Ensure to strip newlines for single line mode + new_content = "\n".join(current_content) + self.widget.delete("1.0", tk.END) + self.widget.insert("1.0", new_content) + else: + # Append new message + self.widget.insert(tk.END, message) + self.widget.see(tk.END) + self.widget.configure(state='disabled') def flush(self): pass - def readline(self): - return '' - -def create_image_label(parent, root, frames): - lbl_luther_image = tk.Label(parent, image=frames[0]) - lbl_luther_image.grid(row=0, rowspan=8, column=0, padx=10, pady=10) - - return lbl_luther_image + def set_single_line_mode(self, mode): + self.single_line_mode = mode def gui_main(): - root = tk.Tk() root.geometry("1500x640") root.title("Lutherscripts (Dev version) - A NLP toolset for Latin language") @@ -130,10 +136,10 @@ def update_explanation(*args): "Tokenize Latin text by words": "This operation will tokenize your Latin text by words, which is required for further word-based natural language processing, using CLTK. You can manually segmentate the text via inserting a headline in a format #Detail,Otherdetail,Thirddetail# and end marker of the segment as #end#. That will be interpreted by the tokenizer as a single document, with metadata provided in the header", "Tokenize Latin text by sentences": "This operation will tokenize your Latin text by sentences, which is useful for sentence-based natural language processing, using CLTK. As of dev version, not in the par of the other operations.", "Perform KWIC analysis from your JSON word tokenized text": "This operation will perform a Key Word in Context (KWIC) analysis, allowing you to see the occurrences of a word within the context of the text, using NLTK. Source must be a Word Tokenized text in JSON format.", - "Perform word frequency analysis": "This operation will perform a Word Frequency Analysis, allowing you to see the number of times each word has been used in your target text, using NLTK. Source must be a Word Tokenized text in JSON format.", + "Perform word frequency analysis from your JSON word tokenized text": "This operation will perform a Word Frequency Analysis, allowing you to see the number of times each word has been used in your target text, using NLTK. Source must be a Word Tokenized text in JSON format.", "Build a dictionary and corpus from your JSON word tokenized text": "This operation will build a dictionary and a corpus from your Word Tokenized text in JSON format using GenSim, for to source further operations. 
As Arg 1 pass minimum appearance of a word in a document corpus to be accepted to the corpus, as Arg 2 pass the maximum in a fraction of a document to do the same.", "Perform Topic Modeling from your dictionary and corpus": "This operation will perform Topic Modeling using GenSim from your dictionary and corpus files. As Argument 1, pass the number of topics you want to try dig out from the text. As Argument 2, pass the number of passes to perform on the corpus. Test different values both here and during the corpus building for to achieve accuracy.", - "Export docs": "Lutherscripts Latin tokenizer will output the source to a JSON array. Export that to separate txt documents for work with tools like Voyant." + "Export the tokenized JSON to multiple txt documents": "Lutherscripts Latin tokenizer will output the source to a JSON array. Export that to separate txt documents for work with tools like Voyant." } selected_operation = var_operation.get() @@ -178,6 +184,21 @@ def choose_output_file(): root.grid_rowconfigure(5, weight=1) sys.stdout = CustomTextRedirector(txt_terminal) sys.stderr = CustomTextRedirector(txt_terminal) + + def update_txt_terminal(): + try: + while not message_queue.empty(): + message = message_queue.get_nowait() + txt_terminal.configure(state='normal') + txt_terminal.insert(tk.END, message) + txt_terminal.see(tk.END) + txt_terminal.configure(state='disabled') + except queue.Empty: + pass # No more messages to display + finally: + # Reschedule this function to run again after 100 ms + root.after(100, update_txt_terminal) + def update_image_label(lbl, frames): @@ -185,7 +206,83 @@ def update_image_label(lbl, frames): frames.append(frame) lbl.config(image=frame) + def start_async_operation(): + """Start the async operation in a new thread.""" + try: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + loop.run_until_complete(run_script_async()) + except Exception as e: + print(f"An error occurred: {e}") # Print or log the error + finally: + # Ensure these actions are performed back in the main GUI thread + root.after(0, finalize_operation) + + def finalize_operation(): + """Re-enable the button and stop the animation after the operation is done.""" + global stop_flag + stop_flag = True # Stop the animation + btn_play.configure(state='normal') # Re-enable the button + + def animate_luther(): + global stop_flag + while not stop_flag: + update_image_label(lbl_luther_image, frames) + root.update() + time.sleep(interval) + + def start_operation(): + global stop_flag + stop_flag = False + btn_play.configure(state='disabled') + txt_terminal.configure(state='normal') + txt_terminal.delete(1.0, tk.END) # Clear existing text + txt_terminal.configure(state='disabled') + + # Run the async operation in a separate thread + threading.Thread(target=run_script_async, daemon=True).start() + + # Update the GUI periodically + update_txt_terminal() + # Check the selected operation and validate arguments for KWIC analysis + operation_name = [option[0] for option in options if option[1] == var_operation.get()][0] + if operation_name == "kwic_analysis": + argument1 = ent_argument1.get() + argument2 = ent_argument2.get() + if not argument1 or not argument2: + print("Please enter both Argument1 (Keyword) and argument2 (Context length, a number of words you want to see left and right of a keyword hit) for the KWIC analysis") + return + if operation_name == "topic_modeling": + argument1 = ent_argument1.get() + argument2 = ent_argument2.get() + if not argument1 or not argument2: + 
print("Please enter both Argument1 (Number of Topics) and argument2 (Number of Corpus Passes during LDA Training) ") + return + + async_thread = threading.Thread(target=start_async_operation, daemon=True) + async_thread.start() + + # Start the animation thread + stop_flag[0] = False + animation_thread = threading.Thread(target=animate_luther, args=(stop_flag,)) + animation_thread.daemon = True + animation_thread.start() + + print("Starting operation...") + print("Please wait, this might take couple of seconds...") + btn_play.configure(state='disabled') + + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + # Call the main function with the callback function + loop.run_until_complete(run_script_async()) + loop.close() + btn_play.configure(state='normal') + stop_flag[0] = True + + async def run_script_async(): operation_name = [option[0] for option in options if option[1] == var_operation.get()][0] source_path = os.path.normpath(location_raw_sourcetext) @@ -213,24 +310,23 @@ async def run_script_async(): ) lbl_luther_image.config(image=gif1) + - output = '' + output_buffer = '' # Buffer to collect output while True: char = await process.stdout.read(1) if not char: break - char = char.decode(errors='replace') + output_buffer += char.decode(errors='replace') + + if '\n' in output_buffer or '\r' in output_buffer: + # Put the buffer into the queue and reset it + message_queue.put(output_buffer) + output_buffer = '' + # Ensure any remaining output is sent to the queue + if output_buffer: + message_queue.put(output_buffer) - if char == '\r': - txt_terminal.configure(state='normal') - txt_terminal.delete(f'{tk.END} -2c linestart', tk.END) - txt_terminal.insert(tk.END, output) - txt_terminal.see(tk.END) - txt_terminal.configure(state='disabled') - txt_terminal.update() - output = '' - else: - output += char stderr_data = await process.stderr.read() if stderr_data: @@ -239,49 +335,6 @@ async def run_script_async(): txt_terminal.see(tk.END) txt_terminal.configure(state='disabled') txt_terminal.update() - - def animate_luther(stop_flag): - while not stop_flag[0]: - update_image_label(lbl_luther_image, frames) - root.update() - time.sleep(interval) - - - def start_operation(): - # Check the selected operation and validate arguments for KWIC analysis - operation_name = [option[0] for option in options if option[1] == var_operation.get()][0] - if operation_name == "kwic_analysis": - argument1 = ent_argument1.get() - argument2 = ent_argument2.get() - if not argument1 or not argument2: - print("Please enter both Argument1 (Keyword) and argument2 (Context length, a number of words you want to see left and right of a keyword hit) for the KWIC analysis") - return - if operation_name == "topic_modeling": - argument1 = ent_argument1.get() - argument2 = ent_argument2.get() - if not argument1 or not argument2: - print("Please enter both Argument1 (Number of Topics) and argument2 (Number of Corpus Passes during LDA Training) ") - return - - # Start the animation thread - stop_flag[0] = False - animation_thread = threading.Thread(target=animate_luther, args=(stop_flag,)) - animation_thread.daemon = True - animation_thread.start() - - print("Starting operation...") - print("Please wait, this might take couple of seconds...") - - btn_play.configure(state='disabled') - - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - # Call the main function with the callback function - loop.run_until_complete(run_script_async()) - loop.close() - btn_play.configure(state='normal') - stop_flag[0] = 
True # Start Operation! button btn_play = tk.Button(root, text="Start Operation!", command=start_operation) @@ -297,6 +350,8 @@ def start_operation(): sys.stdout = CustomTextRedirector(txt_terminal) sys.stderr = CustomTextRedirector(txt_terminal) + update_txt_terminal() # Start checking the queue + # Start the GUI root.mainloop() diff --git a/lutherscripts/src/text_preparation/cltk_sentencetokenize_latin_arg.py b/lutherscripts/src/text_preparation/cltk_sentencetokenize_latin_arg.py index fb37cdb..d44fdf1 100644 --- a/lutherscripts/src/text_preparation/cltk_sentencetokenize_latin_arg.py +++ b/lutherscripts/src/text_preparation/cltk_sentencetokenize_latin_arg.py @@ -1,46 +1,77 @@ -import io import os import re +import sys import json from cltk import NLP from tqdm import tqdm import logging -import sys -import io - +from cltk.stops.lat import STOPS as LATIN_STOPS +from lutherscripts.src.data.extrastopwords import extrastopwords_lat as EXTRA_STOPS +from cltk.lemmatize.lat import LatinBackoffLemmatizer def main(source_path, destination_path, progress_callback=None): + logging.basicConfig(level=logging.INFO) + # Instantiate a Latin-specific NLP object cltk_nlp = NLP(language="lat") - logging.basicConfig(level=logging.INFO) + # Instantiate a Latin-specific lemmatizer + latin_lemmatizer = LatinBackoffLemmatizer() - # Load the Latin text from the source file input_file = os.path.abspath(source_path) logging.info(f"Reading input from file: {input_file}") with open(input_file, 'r', encoding='utf-8') as f: input_text = f.read() - # Split the input text into smaller chunks based on punctuation - chunk_delimiters = r'[.!?]+' - text_chunks = re.split(chunk_delimiters, input_text) + # Split the input text into documents + input_documents = re.split(r'(?<=#end#)', input_text, flags=re.IGNORECASE) + + document_sentences = [] - # Process the text_chunks with cltk_nlp and update the progress bar - sentence_tokens = [] - for chunk in tqdm(text_chunks, desc="Tokenizing sentences"): - doc = cltk_nlp(chunk) + for document in tqdm(input_documents, desc="Processing documents", file=sys.stdout): + if not document.strip(): + continue + + # Extract metadata from the document + metadata = re.search(r'#(.*?)#', document).group(1) + + # Remove metadata from the document + document_text = re.sub(r'#.*?#', '', document) + + # Convert the document text to lowercase + document_text = document_text.lower() + + # Remove punctuation marks, digits, and special characters from the document + document_no_punctuation = re.sub(r'[^\w\s]', '', document_text) + document_no_digits = re.sub(r'\d+', '', document_no_punctuation) + + # Tokenize the document into sentences + doc = cltk_nlp(document_no_digits) + + cleaned_sentences = [] for sentence in doc.sentences: - sentence_text = ' '.join([word.string for word in sentence.words]) - sentence_tokens.append(sentence_text.strip()) - if progress_callback: - progress_callback(len(sentence_tokens) / len(text_chunks)) + # Lemmatize and remove stopwords from each sentence + lemmatized_words = [ + latin_lemmatizer.lemmatize([word.string])[0][1].lower() + for word in sentence.words + if word.string.lower() not in LATIN_STOPS and word.string.lower() not in EXTRA_STOPS + ] + cleaned_sentence = ' '.join(lemmatized_words).strip() + if cleaned_sentence: # Ensure the sentence is not empty + cleaned_sentences.append(cleaned_sentence) + + # Append the processed sentences along with metadata + document_sentences.append({"metadata": metadata, "sentences": cleaned_sentences}) - # Save the tokenized output 
to a JSON file - output_file = os.path.abspath(destination_path) - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(sentence_tokens, f, ensure_ascii=False) + # Save the tokenized and lemmatized output to a JSON file + output_file = os.path.abspath(destination_path) + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(document_sentences, f, ensure_ascii=False) - # Print a message to confirm that the file has been saved - print(f'The tokenized output has been saved as {output_file}') + # Print a message to confirm that the file has been saved + print(f'The tokenized and lemmatized output has been saved as {output_file}') +if __name__ == '__main__': + # Example usage + main('source.json', 'destination.json') diff --git a/lutherscripts/src/text_preparation/cltk_wordtokenize_latin_arg.py b/lutherscripts/src/text_preparation/cltk_wordtokenize_latin_arg.py index a4f3eec..e3817d7 100644 --- a/lutherscripts/src/text_preparation/cltk_wordtokenize_latin_arg.py +++ b/lutherscripts/src/text_preparation/cltk_wordtokenize_latin_arg.py @@ -12,68 +12,51 @@ import re def main(source_path, destination_path, progress_callback=None): - logging.basicConfig(level=logging.INFO) - # Instantiate a Latin-specific NLP object cltk_nlp = NLP(language="lat") - - # Instantiate a Latin-specific lemmatizer latin_lemmatizer = LatinBackoffLemmatizer() input_file = os.path.abspath(source_path) - # Load the Latin text from the source file with open(input_file, 'r', encoding='utf-8') as f: input_text = f.read() - # Split the input text into documents input_documents = re.split(r'(?<=#end#)', input_text, flags=re.IGNORECASE) document_tokens = [] - for document in input_documents: + for document in tqdm(input_documents, desc="Processing documents", unit="document", file=sys.stdout): if not document.strip(): continue - # Extract metadata from the document - metadata = re.search(r'#(.*?)#', document).group(1) - - # Remove metadata from the document - document = re.sub(r'#.*?#', '', document) - - # Convert the document text to lowercase - document = document.lower() - - # Remove punctuation marks, digits, and special characters from the document - document_no_punctuation = re.sub(r'[^\w\s]', '', document) - - # Find all numerals that are not within metadata and remove them - num_pattern = r'(? 
2: - word_tokens.append(lemma) + # Process each word in the document_text + words = document_text.split() + for word in tqdm(words, desc=f"Tokenizing {metadata}", unit="token", leave=False, file=sys.stdout): + cleaned_word = re.sub(r'\d+', '', word) + if not cleaned_word: + continue + + doc = cltk_nlp(cleaned_word) + for processed_word in doc.words: + lemma = latin_lemmatizer.lemmatize([processed_word.string])[0][1].lower() + cleaned_lemma = re.sub(r'\d+', '', lemma) + if cleaned_lemma and cleaned_lemma not in LATIN_STOPS and cleaned_lemma not in EXTRA_STOPS and len(cleaned_lemma) > 1: + word_tokens.append(cleaned_lemma) document_tokens.append({"metadata": metadata, "tokens": word_tokens}) - # Save the tokenized output to a JSON file output_file = os.path.abspath(destination_path) with open(output_file, 'w', encoding='utf-8') as f: json.dump(document_tokens, f, ensure_ascii=False) - # Print a message to confirm that the file has been saved print(f'The tokenized output has been saved as {destination_path}') - - - - +if __name__ == '__main__': + main('source.json', 'destination.json') \ No newline at end of file diff --git a/lutherscripts/src/text_processing/nltk_do_freqanalysis.py b/lutherscripts/src/text_processing/nltk_do_freqanalysis.py index e78e1fc..d23d993 100644 --- a/lutherscripts/src/text_processing/nltk_do_freqanalysis.py +++ b/lutherscripts/src/text_processing/nltk_do_freqanalysis.py @@ -1,31 +1,32 @@ import json -import os -import nltk from nltk.probability import FreqDist -from collections import OrderedDict from tqdm import tqdm - -def main(source_path, destination_path, progress_callback=None): +def main(source_path, destination_path): # Load the tokenized text from the source file with open(source_path, 'r', encoding='utf-8') as f: - tokenized_text = json.load(f) + documents = json.load(f) # Create a frequency distribution using NLTK with a progress bar fdist = FreqDist() - for token in tqdm(tokenized_text, desc="Creating frequency distribution", unit="token"): - fdist[token] += 1 - if progress_callback: - progress_callback(fdist.N() / len(tokenized_text)) + for document in tqdm(documents, desc="Creating frequency distribution", unit="document"): + # Extract tokens from each document + tokens = document.get('tokens', []) # Ensure there's a default empty list if 'tokens' is missing + for token in tokens: + # Increment the count for each token + fdist[token] += 1 - # Sort the frequency distribution by frequency - sorted_fdist = OrderedDict(fdist.most_common()) + # Convert the frequency distribution to a dictionary for JSON serialization + fdist_dict = {word: freq for word, freq in fdist.items()} # Save the frequency distribution as a JSON file with open(destination_path, 'w', encoding='utf-8') as f: - json.dump(sorted_fdist, f, ensure_ascii=False, indent=2) + json.dump(fdist_dict, f, ensure_ascii=False, indent=2) # Print a message to confirm that the file has been saved print(f'The frequency analysis has been saved as {destination_path}') +if __name__ == '__main__': + # Example usage + main('source.json', 'destination.json') diff --git a/testrun1_wordtokenizer2.bat b/testrun1_wordtokenizer2.bat index 4c33ee1..ed57318 100644 --- a/testrun1_wordtokenizer2.bat +++ b/testrun1_wordtokenizer2.bat @@ -2,4 +2,4 @@ set PYTHONIOENCODING=utf-8 set "PYTHON_SCRIPTS_PATH=C:\Users\bg1\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\Scripts" set "PATH=%PYTHON_SCRIPTS_PATH%;%PATH%" -lutherscripts-cli -o word_tokenize_latin -s 
lutherscripts/txt/DSA_ready.txt -d lutherscripts/output/DSA_tokenized.json -1 10 -2 0.5 +lutherscripts-cli -o word_tokenize_latin -s python-deservoarbitrio-textanalysis/lutherscripts/txt/DSA_ready_test2.txt -d python-deservoarbitrio-textanalysis/lutherscripts/output/DSA_tokenized2.json -1 10 -2 0.5 diff --git a/testrun2_corpusbuilder.bat b/testrun2_corpusbuilder.bat index 79e8eee..5206be1 100644 --- a/testrun2_corpusbuilder.bat +++ b/testrun2_corpusbuilder.bat @@ -2,4 +2,4 @@ set PYTHONIOENCODING=utf-8 set "PYTHON_SCRIPTS_PATH=C:\Users\bg1\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\Scripts" set "PATH=%PYTHON_SCRIPTS_PATH%;%PATH%" -lutherscripts-cli -o build_corpus -s lutherscripts/output/dsa_tokenized.json -d lutherscripts/output/DSACorpus1 +lutherscripts-cli -o build_corpus -s python-deservoarbitrio-textanalysis/lutherscripts/output/dsa_march24_1.json -d python-deservoarbitrio-textanalysis/lutherscripts/output/dsa_mar24_corpustest diff --git a/testrun3_topicmodeler.bat b/testrun3_topicmodeler.bat index b87b2a9..560b7ce 100644 --- a/testrun3_topicmodeler.bat +++ b/testrun3_topicmodeler.bat @@ -2,4 +2,4 @@ set PYTHONIOENCODING=utf-8 set "PYTHON_SCRIPTS_PATH=C:\Users\bg1\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\Scripts" set "PATH=%PYTHON_SCRIPTS_PATH%;%PATH%" -lutherscripts-cli -o topic_modeling -1 10 -2 100 -3 5 -s lutherscripts/output/dsa_tokenized.json -c lutherscripts/output/DSACorpus1_corpus.mm -dc lutherscripts/output/DSACorpus1_dictionary.pkl -d lutherscripts/output/DSACorpus1_topictest.json +lutherscripts-cli -o topic_modeling -1 10 -2 100 -3 5 -s python-deservoarbitrio-textanalysis/lutherscripts/output/dsa_march24_1.json -c python-deservoarbitrio-textanalysis/lutherscripts/output/dsa_mar24_corpustest.json_corpus.mm -dc python-deservoarbitrio-textanalysis/lutherscripts/output/dsa_mar24_corpustest.json_dictionary.pkl -d python-deservoarbitrio-textanalysis/lutherscripts/output/dsa_topictest.json From 54b32bd00d4a4ac11b2a2cba50e6579a2f132d60 Mon Sep 17 00:00:00 2001 From: regularjoe <53127823+benjamsf@users.noreply.github.com> Date: Tue, 12 Mar 2024 22:59:07 +0200 Subject: [PATCH 5/6] gui fixes, still ugly though --- lutherscripts/gui.py | 97 +++++++++++++------------------------------- 1 file changed, 28 insertions(+), 69 deletions(-) diff --git a/lutherscripts/gui.py b/lutherscripts/gui.py index 9547f05..a4a153b 100644 --- a/lutherscripts/gui.py +++ b/lutherscripts/gui.py @@ -20,37 +20,23 @@ __license__ = "MIT" # This flag and queue are used for communication between threads and updating the GUI -stop_flag = False +stop_flag = [False] message_queue = queue.Queue() class CustomTextRedirector: - def __init__(self, widget, single_line_mode=False): + def __init__(self, widget): self.widget = widget - self.single_line_mode = single_line_mode - self.widget.configure(background='black', foreground='green', font=('Arial')) + self.widget.configure(background='black', foreground='green', font=('Arial', 12)) def write(self, message): if self.widget.winfo_exists(): self.widget.configure(state='normal') - if self.single_line_mode: - # Replace last line - current_content = self.widget.get("1.0", tk.END).splitlines() - current_content[-1] = message.strip() # Ensure to strip newlines for single line mode - new_content = "\n".join(current_content) - self.widget.delete("1.0", tk.END) - self.widget.insert("1.0", new_content) - else: - # Append new message - 
self.widget.insert(tk.END, message) + self.widget.insert(tk.END, message) self.widget.see(tk.END) self.widget.configure(state='disabled') def flush(self): - pass - - def set_single_line_mode(self, mode): - self.single_line_mode = mode - + pass # Nothing to do here for now def gui_main(): root = tk.Tk() @@ -206,83 +192,54 @@ def update_image_label(lbl, frames): frames.append(frame) lbl.config(image=frame) - def start_async_operation(): - """Start the async operation in a new thread.""" - try: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - loop.run_until_complete(run_script_async()) - except Exception as e: - print(f"An error occurred: {e}") # Print or log the error - finally: - # Ensure these actions are performed back in the main GUI thread - root.after(0, finalize_operation) - def finalize_operation(): """Re-enable the button and stop the animation after the operation is done.""" - global stop_flag - stop_flag = True # Stop the animation btn_play.configure(state='normal') # Re-enable the button + stop_flag[0] = True - def animate_luther(): - global stop_flag - while not stop_flag: + def animate_luther(stop_flag): + while not stop_flag[0]: update_image_label(lbl_luther_image, frames) root.update() time.sleep(interval) def start_operation(): - global stop_flag - stop_flag = False btn_play.configure(state='disabled') txt_terminal.configure(state='normal') txt_terminal.delete(1.0, tk.END) # Clear existing text txt_terminal.configure(state='disabled') + print("Starting the requested operation...") # Run the async operation in a separate thread - threading.Thread(target=run_script_async, daemon=True).start() + threading.Thread(target=start_async_operation, daemon=True).start() # Update the GUI periodically update_txt_terminal() - # Check the selected operation and validate arguments for KWIC analysis - operation_name = [option[0] for option in options if option[1] == var_operation.get()][0] - if operation_name == "kwic_analysis": - argument1 = ent_argument1.get() - argument2 = ent_argument2.get() - if not argument1 or not argument2: - print("Please enter both Argument1 (Keyword) and argument2 (Context length, a number of words you want to see left and right of a keyword hit) for the KWIC analysis") - return - if operation_name == "topic_modeling": - argument1 = ent_argument1.get() - argument2 = ent_argument2.get() - if not argument1 or not argument2: - print("Please enter both Argument1 (Number of Topics) and argument2 (Number of Corpus Passes during LDA Training) ") - return - async_thread = threading.Thread(target=start_async_operation, daemon=True) - async_thread.start() - # Start the animation thread stop_flag[0] = False animation_thread = threading.Thread(target=animate_luther, args=(stop_flag,)) animation_thread.daemon = True animation_thread.start() - print("Starting operation...") - print("Please wait, this might take couple of seconds...") - - btn_play.configure(state='disabled') - - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) + def start_async_operation(): + """Start the async operation in a new thread.""" + def run_in_background(): + try: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + loop.run_until_complete(run_script_async()) + loop.close() + except Exception as e: + print(f"An error occurred: {e}") + finally: + root.after(0, finalize_operation) + + global stop_flag # Use the existing stop_flag list + stop_flag[0] = False # Update the value in the list + threading.Thread(target=run_in_background, daemon=True).start() - # 
Call the main function with the callback function - loop.run_until_complete(run_script_async()) - loop.close() - btn_play.configure(state='normal') - stop_flag[0] = True - async def run_script_async(): operation_name = [option[0] for option in options if option[1] == var_operation.get()][0] source_path = os.path.normpath(location_raw_sourcetext) @@ -336,6 +293,8 @@ async def run_script_async(): txt_terminal.configure(state='disabled') txt_terminal.update() + print("Operation finished.") + # Start Operation! button btn_play = tk.Button(root, text="Start Operation!", command=start_operation) btn_play.grid(row=6, column=3, padx=10, pady=10) From d5a200eb33348a68d3c6fdd2b8d241bef856b3ca Mon Sep 17 00:00:00 2001 From: regularjoe <53127823+benjamsf@users.noreply.github.com> Date: Tue, 12 Mar 2024 23:11:05 +0200 Subject: [PATCH 6/6] the script is now garbage but add support for it --- lutherscripts/cli.py | 8 +++- lutherscripts/gui.py | 6 ++- ..._createseparatefiles_from_prepared_text.py | 38 +++++++++++++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 lutherscripts/src/text_preparation/python_createseparatefiles_from_prepared_text.py diff --git a/lutherscripts/cli.py b/lutherscripts/cli.py index cb941ec..5029676 100644 --- a/lutherscripts/cli.py +++ b/lutherscripts/cli.py @@ -13,7 +13,7 @@ def add_arguments(parser): - parser.add_argument("-o", "--operation", type=str, choices=["word_tokenize_latin", "sent_tokenize_latin", "kwic_analysis", "freq_analysis", "build_corpus", "topic_modeling", "word_document_probability", "export_docs"], required=True, help="Choose operation: word_tokenize_latin, sent_tokenize_latin, kwic_analysis, corpus_builder, topic_modeler, export_docs or word_document_probability") + parser.add_argument("-o", "--operation", type=str, choices=["word_tokenize_latin", "sent_tokenize_latin", "kwic_analysis", "freq_analysis", "build_corpus", "topic_modeling", "word_document_probability", "export_docs", "export_prepared_text"], required=True, help="Choose operation: word_tokenize_latin, sent_tokenize_latin, kwic_analysis, corpus_builder, topic_modeler, export_docs, export_prepared_text or word_document_probability") parser.add_argument("-1", "--first-detail", type=float, help="First detail flag for operation, depends on the operation") parser.add_argument("-2", "--second-detail", type=float, help="Second detail flag for operation, depends on the operation") parser.add_argument("-3", "--third-detail", type=int, help="Third detail flag for operation, depends on the operation") @@ -56,6 +56,10 @@ def export_docs(source_path, destination_path): from src.text_preparation.python_createseparatefiles_from_metadata import main as export_docs output = export_docs(source_path, destination_path) +def export_prepared_text(source_path, destination_path): + from src.text_preparation.python_createseparatefiles_from_prepared_text import main as export_prepared_text + output = export_prepared_text(source_path, destination_path) + def cli_main(): parser = argparse.ArgumentParser(description="Lutherscript operations launcher") add_arguments(parser) @@ -97,6 +101,8 @@ def cli_main(): build_corpus(source_path, destination_path, args.first_detail, args.second_detail) elif args.operation == 'export_docs': export_docs(source_path, destination_path) + elif args.operation == 'export_prepared_text': + export_prepared_text(source_path, destination_path) if __name__ == '__main__': diff --git a/lutherscripts/gui.py b/lutherscripts/gui.py index a4a153b..7087d50 100644 --- 
a/lutherscripts/gui.py +++ b/lutherscripts/gui.py @@ -114,7 +114,8 @@ def gui_main(): ("freq_analysis", "Perform word frequency analysis from your JSON word tokenized text"), ("build_corpus", "Build a dictionary and corpus from your JSON word tokenized text"), ("topic_modeling", "Perform Topic Modeling from your dictionary and corpus"), - ("export_docs", "Export the tokenized JSON to multiple txt documents") + ("export_docs", "Export the tokenized JSON to multiple txt documents"), + ("export_prepared_text", "Export Lutherscripts prepared txt to multiple txt documents") ] def update_explanation(*args): @@ -125,7 +126,8 @@ def update_explanation(*args): "Perform word frequency analysis from your JSON word tokenized text": "This operation will perform a Word Frequency Analysis, allowing you to see the number of times each word has been used in your target text, using NLTK. Source must be a Word Tokenized text in JSON format.", "Build a dictionary and corpus from your JSON word tokenized text": "This operation will build a dictionary and a corpus from your Word Tokenized text in JSON format using GenSim, for to source further operations. As Arg 1 pass minimum appearance of a word in a document corpus to be accepted to the corpus, as Arg 2 pass the maximum in a fraction of a document to do the same.", "Perform Topic Modeling from your dictionary and corpus": "This operation will perform Topic Modeling using GenSim from your dictionary and corpus files. As Argument 1, pass the number of topics you want to try dig out from the text. As Argument 2, pass the number of passes to perform on the corpus. Test different values both here and during the corpus building for to achieve accuracy.", - "Export the tokenized JSON to multiple txt documents": "Lutherscripts Latin tokenizer will output the source to a JSON array. Export that to separate txt documents for work with tools like Voyant." + "Export the tokenized JSON to multiple txt documents": "Lutherscripts Latin tokenizer will output the source to a JSON array. Export that to separate txt documents for work with tools like Voyant.", + "Export Lutherscripts prepared txt to multiple txt documents": "Export Lutherscripts prepared text to multiple txt documents. Text prepared by Lutherscripts remove clutter from Luther's Werke im WWW / other raw text sources, and enables you to separate it to documents in a way that Lutherscripts understands. Now export the text to multiple txt files based on that markup, in order to use them in 3rd party tools like VoyantTools." 
} selected_operation = var_operation.get() diff --git a/lutherscripts/src/text_preparation/python_createseparatefiles_from_prepared_text.py b/lutherscripts/src/text_preparation/python_createseparatefiles_from_prepared_text.py new file mode 100644 index 0000000..fcfc28b --- /dev/null +++ b/lutherscripts/src/text_preparation/python_createseparatefiles_from_prepared_text.py @@ -0,0 +1,38 @@ +import os +import re +from tqdm import tqdm + +def main(source_path, destination_path): + # Ensure the destination directory exists + os.makedirs(destination_path, exist_ok=True) + + with open(source_path, 'r', encoding='utf-8') as f: + text = f.read() + + # Use regular expressions to find titles and split the text into documents + documents = re.split(r'#\s*([^#]+)\s*#', text) + + # Remove any leading or trailing whitespace from titles and bodies + documents = [d.strip() for d in documents] + + # Pair titles with bodies, excluding the first empty string (if any) + pairs = [(documents[i], documents[i+1]) for i in range(0, len(documents), 2) if i+1 < len(documents)] + + # Process each document with a progress bar + for i, (title, body) in tqdm(enumerate(pairs), total=len(pairs), desc="Processing documents"): + # Make sure the title is safe to use as a filename + filename_title = sanitize_filename(title) + + # Save each document as a separate .txt file, named after the title + filename = os.path.join(destination_path, f'{filename_title}.txt') + with open(filename, 'w', encoding='utf-8') as f: + f.write(body) # Write only the body without the title inside the file + + print(f'The text has been exported to separate txt documents at {destination_path}.') + +def sanitize_filename(title): + """Remove disallowed characters and shorten the filename if necessary.""" + # Replace any characters not allowed in file names with underscores + safe_title = "".join(c if c.isalnum() or c in " -_" else "_" for c in title) + # Shorten the title if it's too long for a filename + return safe_title[:250] # Filesystem limit, can be adjusted
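
Editor's note on usage: taken together, patches 1/6 through 3/6 and 6/6 add two export paths aimed at Voyant Tools. export_docs writes each entry of the word-tokenized JSON array to its own .txt file, and export_prepared_text is meant to do the same directly from a Lutherscripts-prepared source text. Following the pattern of the testrun*.bat scripts, a typical call would be `lutherscripts-cli -o export_docs -s lutherscripts/output/DSA_tokenized.json -d lutherscripts/output/voyant_docs` (the paths here are placeholders, and the -s/-d flags are assumed to behave as they do for the other operations; both exporters create the destination directory if it does not exist). As of patch 3/6, each file is named after the document's metadata header passed through sanitize_filename, which keeps letters, digits, spaces, hyphens and underscores, replaces every other character with an underscore, and truncates to 250 characters, while the file body is the comma-separated token list. A segment headed #Detail,Otherdetail,Thirddetail# (the example format given in the tokenizer help text) would therefore be exported as Detail_Otherdetail_Thirddetail.txt.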
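
Editor's note on the patch 6/6 splitter: the commit message itself flags python_createseparatefiles_from_prepared_text.py as broken, and the code bears that out: re.split with a capturing group returns text chunks at even indices and the captured #...# headers (including the #end# markers) at odd indices, so the even/odd pairing associates bodies and titles incorrectly. For reference only, below is a minimal Python sketch of how the same job could be done, assuming source files follow the #Title# ... #end# convention documented for the word tokenizer; the function name split_prepared_text and the paths are hypothetical and not part of these patches.

import os
import re

def sanitize_filename(title):
    # Same policy as the committed helper: keep letters, digits, spaces,
    # hyphens and underscores, replace everything else, and cap the length.
    safe_title = "".join(c if c.isalnum() or c in " -_" else "_" for c in title)
    return safe_title[:250]

def split_prepared_text(source_path, destination_path):
    # Hypothetical alternative pairing: match every "#Title# ... #end#" block
    # so headers stay with their bodies and "#end#" is never read as a title.
    os.makedirs(destination_path, exist_ok=True)

    with open(source_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Group 1 captures the header, group 2 the body up to the matching "#end#".
    pattern = re.compile(r'#\s*([^#]+?)\s*#(.*?)#\s*end\s*#', re.DOTALL | re.IGNORECASE)
    matches = pattern.findall(text)

    for title, body in matches:
        filename = os.path.join(destination_path, f'{sanitize_filename(title)}.txt')
        with open(filename, 'w', encoding='utf-8') as out:
            out.write(body.strip())

    print(f'Exported {len(matches)} documents to {destination_path}.')

This mirrors the document convention the CLTK tokenizers in patch 4/6 already rely on (they split on #end# and read the #...# header as metadata), so the same prepared file could feed either the tokenization pipeline or a direct export.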