Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions Assignment_files.py/analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

def word_frequency_analysis(filename):
    """Return a dict mapping each whitespace-separated token to its count.

    Tokens are compared exactly as they appear in the file (no case
    folding or punctuation stripping happens here).
    """
    counts = {}
    with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            for token in raw_line.split():
                if token in counts:
                    counts[token] += 1
                else:
                    counts[token] = 1
    return counts



def text_summary_statistics(filename):
    """Print basic corpus statistics for *filename* and return its word counts.

    Reports total words, unique words, average word length, and vocabulary
    richness (unique / total). Returns the frequency dict so callers can
    reuse it without re-reading the file.
    """
    counts = word_frequency_analysis(filename)

    n_words = sum(counts.values())
    # Character total weights each word's length by how often it occurs.
    n_chars = sum(len(word) * freq for word, freq in counts.items())
    n_unique = len(counts)

    # Guard against an empty file to avoid dividing by zero.
    if n_words:
        mean_word_length = n_chars / n_words
        richness = n_unique / n_words
    else:
        mean_word_length = 0
        richness = 0

    print("\n Text Summary Statistics:\n")
    print("Total words:", n_words)
    print("Unique words:", n_unique)
    print("Average word length:", round(mean_word_length, 2))
    print("Vocabulary richness:", round(richness, 3))

    return counts


def common_words_chart(filename, top_n=10):
    """Show a bar chart of the *top_n* most frequent words in *filename*."""
    frequencies = word_frequency_analysis(filename)
    # Rank every word by count, highest first, then keep the top slice.
    ranked = sorted(frequencies.items(), key=lambda pair: pair[1], reverse=True)
    top = ranked[:top_n]

    labels = [pair[0] for pair in top]
    heights = [pair[1] for pair in top]

    plt.figure(figsize=(15, 5))
    plt.bar(labels, heights)
    plt.title(f"Top {top_n} Most Frequent Words")
    plt.xlabel("Words")
    plt.ylabel("Count")
    plt.xticks(rotation=45)  # tilt labels so long words stay readable
    plt.show()


def sentiment_analysis(filename):
    """Print VADER sentiment scores for the whole text in *filename*."""
    # Fetch the lexicon quietly; a no-op if it is already cached locally.
    nltk.download('vader_lexicon', quiet=True)
    analyzer = SentimentIntensityAnalyzer()

    with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
        contents = handle.read()

    scores = analyzer.polarity_scores(contents)

    print(" SENTIMENT ANALYSIS RESULTS ")
    print(scores)


if __name__ == "__main__":
    # Run the full analysis suite on the pre-cleaned corpus produced by
    # download.py (assumes final_cleaned.txt already exists).
    text_summary_statistics('final_cleaned.txt')
    common_words_chart('final_cleaned.txt')
    sentiment_analysis('final_cleaned.txt')
81 changes: 81 additions & 0 deletions Assignment_files.py/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import urllib.request
import string

def download_data(url, filename='pg1513.txt'):
    """Download a text document from *url* and save it to *filename*.

    Returns the local filename on success, or None if the download or
    the file write fails.
    """
    try:
        with urllib.request.urlopen(url) as response:
            text = response.read().decode('utf-8')
        # Removed a leftover debug print that dumped the whole book to stdout.
        with open(filename, 'w', encoding='utf-8') as out:
            out.write(text)
        # Bug fix: the message previously printed the literal word
        # "(unknown)" instead of interpolating the actual filename.
        print(f"File downloaded and saved as {filename}")
        return filename
    except Exception as e:
        print("An error occurred:", e)
        return None


def is_special_line(line):
    """Return True for Project Gutenberg '*** ...' boilerplate marker lines."""
    stripped = line.strip()
    return stripped.startswith('*** ')


def clean_file(input_file, output_file):
    """Copy the body of a Gutenberg text, dropping header and footer.

    Writes only the lines strictly between the first and second
    '*** ...' marker lines. If no marker is found, nothing is written;
    the footer marker and everything after it are discarded.
    """
    in_body = False
    with open(input_file, encoding='utf-8', errors='ignore') as reader, \
            open(output_file, 'w', encoding='utf-8') as writer:
        for line in reader:
            if is_special_line(line):
                if in_body:
                    break  # second marker: footer begins, stop copying
                in_body = True  # first marker: body starts on the next line
                continue
            if in_body:
                writer.write(line)

    # Bug fix: message previously read "save as" instead of "saved as".
    print(f"Cleaned file saved as {output_file}")


def final_clean_version(filename, output_file='final_cleaned.txt'):
    """Lower-case the text, strip punctuation, and normalise whitespace.

    Reads *filename*, removes every ASCII punctuation character,
    lower-cases the result, collapses runs of spaces/tabs inside each
    line to a single space, writes the result to *output_file*, and
    returns *output_file*.
    """
    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()

    no_punct = ''.join(ch for ch in text if ch not in string.punctuation).lower()
    # Bug fix: joining with '' mashed each line's words together
    # ("romeo and" -> "romeoand"), which broke every later word-based
    # analysis. Join with a single space to keep word boundaries while
    # still collapsing runs of whitespace.
    cleaned_lines = [' '.join(line.split()) for line in no_punct.splitlines()]
    cleaned_text = '\n'.join(cleaned_lines)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)
    print(f"Final cleaned text saved as {output_file}")
    return output_file




def remove_stopwords(filename):
    """Remove common English stopwords from *filename*, rewriting it in place.

    Matching is case-insensitive; surviving words keep their original
    form and order, separated by single spaces.
    """
    # A set gives O(1) membership tests; the original list was scanned
    # linearly for every word in the file.
    stopwords = {"a", "of", "to", "in", "it", "is", "i", "that", "was", "he",
                 "you", "for", "on", "with", "as", "his", "they", "be", "at",
                 "one", "have", "this", "from", "or", "had", "by", "not",
                 "but", "what", "all", "were", "we", "her", "can", "an"}

    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    new_lines = []
    for line in lines:
        kept = [w for w in line.split() if w.lower() not in stopwords]
        # Bug fix: ''.join glued the surviving words together; use a
        # space so word boundaries survive for later analysis.
        new_lines.append(' '.join(kept))

    with open(filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(new_lines))

    print("Stopwords removed from the text")


if __name__ == "__main__":
    # Demo run: fetch Romeo and Juliet from Project Gutenberg and push it
    # through the full cleaning pipeline (boilerplate -> punctuation ->
    # stopwords).
    url = 'https://www.gutenberg.org/cache/epub/1513/pg1513.txt'
    raw = download_data(url)

    # Only clean when the download actually succeeded (raw is not None).
    if raw:
        clean_file(raw, 'cleaned_text.txt')
        final_clean_version('cleaned_text.txt')
        remove_stopwords('final_cleaned.txt')
33 changes: 33 additions & 0 deletions Assignment_files.py/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from download import download_data, clean_file, final_clean_version, remove_stopwords
from analysis import text_summary_statistics, common_words_chart, sentiment_analysis
from translation import create_dictionary, translate_text_file
from similarity import text_similarity


def main():
    """Run the whole project end to end.

    Downloads the raw text, cleans it, produces statistics / chart /
    sentiment, translates Old-English words via the external dictionary,
    and finally measures similarity between original and translation.
    """
    print("\n Text Analysis \n")

    url = 'https://www.gutenberg.org/cache/epub/1513/pg1513.txt'
    raw_file = download_data(url)
    if not raw_file:
        # Download failed; nothing downstream can run without the text.
        return

    # Cleaning pipeline: strip Gutenberg boilerplate, then punctuation,
    # then stopwords.
    clean_file(raw_file, "cleaned_text.txt")
    final_clean_version("cleaned_text.txt")
    remove_stopwords("final_cleaned.txt")

    # Analysis passes over the cleaned corpus.
    text_summary_statistics("final_cleaned.txt")
    common_words_chart("final_cleaned.txt")
    sentiment_analysis("final_cleaned.txt")

    # Old-English -> modern-English translation pass.
    dictionary = create_dictionary("development/Dictionary.txt")
    translate_text_file("final_cleaned.txt", "translated_text.txt", dictionary)

    # How close is the translated text to the cleaned original?
    text_similarity("final_cleaned.txt", "translated_text.txt")

    print("\n End of Analysis \n")

if __name__ == "__main__":
    # Entry point: run the complete download/clean/analyse/translate pipeline.
    main()
17 changes: 17 additions & 0 deletions Assignment_files.py/similarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from thefuzz import fuzz

def text_similarity(file1, file2, sample_size=10000):
    """Print and return the fuzzy similarity ratio of two text files.

    Only the first *sample_size* characters of each file are compared,
    which keeps the fuzzy match fast on long texts.
    """
    samples = []
    for path in (file1, file2):
        with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
            samples.append(handle.read(sample_size))

    ratio = fuzz.ratio(samples[0], samples[1])

    print(f"Text similarity ratio {ratio}")
    return ratio

if __name__ == "__main__":
    # Compare the cleaned original against its translated counterpart.
    text_similarity("final_cleaned.txt", "translated_text.txt")
44 changes: 44 additions & 0 deletions Assignment_files.py/translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import string

def create_dictionary(filename):
    """Load 'old - new' word pairs from *filename* into a dict.

    Each useful line has the form ``old - new``; both sides are
    lower-cased and stripped of surrounding whitespace. Lines without
    exactly one '-' separator are ignored.
    """
    translations = {}
    with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw in handle:
            pieces = raw.strip().lower().split('-')
            if len(pieces) != 2:
                continue  # blank or malformed line
            old_word, new_word = pieces
            translations[old_word.strip()] = new_word.strip()

    print(f"{len(translations)} translation words.")
    return translations

def translate_words(word, dictionary):
    """Look up a single word in *dictionary*, ignoring case and punctuation.

    Returns the translation when the cleaned form of *word* is a key;
    otherwise returns *word* untouched (original casing and punctuation).
    """
    key = word.strip(string.punctuation).lower()
    if key in dictionary:
        return dictionary[key]
    return word


def translate_text_file(input_file, output_file, dictionary):
    """Write a word-by-word translated copy of *input_file* to *output_file*.

    Each whitespace-separated word is passed through translate_words;
    translated lines are joined with single spaces and newlines.
    """
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as src:
        source_lines = src.readlines()

    output_lines = []
    for source_line in source_lines:
        converted = [translate_words(token, dictionary) for token in source_line.split()]
        output_lines.append(' '.join(converted))

    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.write('\n'.join(output_lines))

    print(f"Translation saved to {output_file}")


if __name__ == "__main__":
    # Build the Old-English -> modern-English lookup table, then translate
    # the cleaned corpus into a new file.
    dictionary = create_dictionary("development/Dictionary.txt")
    translate_text_file("final_cleaned.txt", "translated_text.txt", dictionary)



68 changes: 67 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,69 @@
# Text-Analysis-Project

Please read the [instructions](instructions.md).

#Project Write Up and Reflection

#1. Project Overview

This project focuses on doing an overview of the play 'Romeo and Juliet' written by William Shakespeare. The functions of the code focused on retrieving the data set from the 'Project Gutenberg' API. The overall project focused on cleaning and polishing all of the dataset; this includes removing all of the punctuation and stopwords, making all the characters lower case, and removing any unnecessary information from the data set. All of this was done using computational methods to then run analyses, such as a sentiment analysis, to extract different information from the data set. After running those analysis methods, I then introduced a document containing translations of Old English words to Modern English words and translated the original file into an updated version with modern English words. Throughout the entire process I used several functions such as text cleaning, stopword removal, word frequency analysis, text summary, visualisation and sentiment analysis, using the NLTK and TheFuzz packages. My goal through this project was to identify different ways in which texts can be analyzed and to determine whether there was a way to introduce another data set to make the book easier for people to understand.


#2.Implementation

I started the project using an .ipynb file where I tested all of the data and went through the creation process of the code. This allowed me to check where functions could be improved or if there was any error that needed to be fixed. This made it easier for me to then pass the code to a .py file and run the final version of the project. After finishing drafting the code, I created several python modules to organize the data more clearly.

For data structures I focused mainly on using dictionaries to store the word frequencies of the data sets and to run the translation for the text. This approach made it easier because it allowed me to keep adding information whilst still running all of the analyses. I also chose to separate all of the data into multiple .py modules as it kept the code cleaner and allowed for independent testing of each section before merging it all together. I also used help from ChatGPT at various stages of this process to draft functions that I wasn't fully aware how to implement (my reasoning on how they work is in the comments in the .ipynb file), and it also helped me with debugging issues. The use of a virtual assistant allowed me to restructure my code more clearly and to make sure it ran all of the functions.

#3. Results

As mentioned, I ran several .py modules that produced different analyses and results. I created five Python modules: download.py, which focuses on downloading, cleaning and processing the file; analysis.py, which runs all of the analysis functions such as a bar chart visualisation, sentiment analysis, etc.; translation.py, which reads the external dictionary file to translate the Old English words in the data files into Modern English; similarity.py, which runs a text similarity comparison between both texts, old and new; and finally main.py, which combines all of the functions into one main module that processes the text and runs all of the analyses in one place.

These allowed me to identify the following results:

Top 10 most common words:

and : 733
the : 688
my : 355
romeo : 298
thou : 277
me : 263
juliet : 178
thy : 170
o : 149
will : 148

Text Summary Statistics:

Total words: 19309
Unique words: 3819
Average word length: 4.76
Average sentence length (in words): 0
Vocabulary richness: 0.198

SENTIMENT ANALYSIS RESULTS
Negative: 0.137
Neutral: 0.7
Positive: 0.163
Overall Compound Score: 1.0

→ The overall sentiment of the text is Positive. (this was a very surprising result given the tragic connotation of Romeo and Juliet)

TEXT SIMILARITY ANALYSIS
Basic Ratio: 98
Partial Ratio: 98
Token Sort Ratio: 96
Token Set Ratio: 99

→ The two texts are very similar.

The bar graph helped plot the frequency of the most common words, I just cannot add an image to the text.

#4. Reflection

The process of this project worked very well, as I managed to get the dictionary to translate the words and run all of the analysis functions on the text. Also, separating the functions into individual Python modules made it easier to manage the data and work step by step. I believe my biggest challenge in this assignment was ensuring that the translation worked correctly, specifically with all of the punctuation and spacing. The most complicated words to translate were the ones with an apostrophe, such as 'they'd', because when I removed all of the punctuation from the text it became challenging for the dictionary to translate these words. That's why I had to go back to the code and make sure that only certain punctuation symbols were removed and not all of them. Another challenge was performance when comparing long texts using TheFuzz; I learned to limit the text sample size for efficiency, as it was taking many minutes to compare both texts together.

My biggest takeaway from this project was realizing how important it is to clean and process real-world text data before running analyses, because otherwise the results given by the functions won't provide an accurate measurement. I also learned (and was surprised) that natural language processing tools such as the NLTK package can help analyze the emotional tone of a novel and focus on the emotional perspective of a dataset rather than just performing statistical analysis. Throughout this process, using a virtual assistant such as ChatGPT was very helpful for completing certain functions and understanding the reasoning behind different approaches — like using dictionaries for word frequency or splitting the project into modules — to get the results I wanted to obtain. The virtual assistant also helped me debug the code and fix errors in my functions, making the overall creation process more effective.

If I were to think of a recommendation to improve this project, I would like to expand the dictionary for a more comprehensive translation covering all of Shakespeare's plays. At the same time, I believe it would be interesting to create a proper Python dictionary that can be reused across multiple files. I would also recommend implementing Markov text synthesis to generate sentences in the same English style that William Shakespeare uses; that way, new texts could also be generated with Old English words to produce more period-accurate writing. Overall, I believe this project was really interesting to implement, and it provided useful findings that taught me the importance of data cleaning, data processing and modular design.

Loading