Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions Assignment_files.py/analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

def word_frequency_analysis(filename):
    """Return a dict mapping each whitespace-separated token to its count.

    Tokens are compared exactly as they appear in the file (no case
    folding or punctuation stripping happens here).
    """
    counts = {}
    with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            for token in raw_line.split():
                if token in counts:
                    counts[token] += 1
                else:
                    counts[token] = 1
    return counts



def text_summary_statistics(filename):
    """Print basic corpus statistics for *filename* and return its word counts.

    Reports total words, unique words, average word length, and vocabulary
    richness (unique / total). Returns the frequency dict so callers can
    reuse it without re-reading the file.
    """
    counts = word_frequency_analysis(filename)

    n_words = sum(counts.values())
    # Character total weights each word's length by how often it occurs.
    n_chars = sum(len(word) * freq for word, freq in counts.items())
    n_unique = len(counts)

    # Guard against an empty file to avoid dividing by zero.
    if n_words:
        mean_word_length = n_chars / n_words
        richness = n_unique / n_words
    else:
        mean_word_length = 0
        richness = 0

    print("\n Text Summary Statistics:\n")
    print("Total words:", n_words)
    print("Unique words:", n_unique)
    print("Average word length:", round(mean_word_length, 2))
    print("Vocabulary richness:", round(richness, 3))

    return counts


def common_words_chart(filename, top_n=10):
    """Show a bar chart of the *top_n* most frequent words in *filename*."""
    frequencies = word_frequency_analysis(filename)
    # Rank every word by count, highest first, then keep the top slice.
    ranked = sorted(frequencies.items(), key=lambda pair: pair[1], reverse=True)
    top = ranked[:top_n]

    labels = [pair[0] for pair in top]
    heights = [pair[1] for pair in top]

    plt.figure(figsize=(15, 5))
    plt.bar(labels, heights)
    plt.title(f"Top {top_n} Most Frequent Words")
    plt.xlabel("Words")
    plt.ylabel("Count")
    plt.xticks(rotation=45)  # tilt labels so long words stay readable
    plt.show()


def sentiment_analysis(filename):
    """Print VADER sentiment scores for the whole text in *filename*."""
    # Fetch the lexicon quietly; a no-op if it is already cached locally.
    nltk.download('vader_lexicon', quiet=True)
    analyzer = SentimentIntensityAnalyzer()

    with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
        contents = handle.read()

    scores = analyzer.polarity_scores(contents)

    print(" SENTIMENT ANALYSIS RESULTS ")
    print(scores)


if __name__ == "__main__":
    # Run the full analysis suite on the pre-cleaned corpus produced by
    # download.py (assumes final_cleaned.txt already exists).
    text_summary_statistics('final_cleaned.txt')
    common_words_chart('final_cleaned.txt')
    sentiment_analysis('final_cleaned.txt')
81 changes: 81 additions & 0 deletions Assignment_files.py/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import urllib.request
import string

def download_data(url, filename='pg1513.txt'):
    """Download a text document from *url* and save it to *filename*.

    Returns the local filename on success, or None if the download or
    the file write fails.
    """
    try:
        with urllib.request.urlopen(url) as response:
            text = response.read().decode('utf-8')
        # Removed a leftover debug print that dumped the whole book to stdout.
        with open(filename, 'w', encoding='utf-8') as out:
            out.write(text)
        # Bug fix: the message previously printed the literal word
        # "(unknown)" instead of interpolating the actual filename.
        print(f"File downloaded and saved as {filename}")
        return filename
    except Exception as e:
        print("An error occurred:", e)
        return None


def is_special_line(line):
    """Return True for Project Gutenberg '*** ...' boilerplate marker lines."""
    stripped = line.strip()
    return stripped.startswith('*** ')


def clean_file(input_file, output_file):
    """Copy the body of a Gutenberg text, dropping header and footer.

    Writes only the lines strictly between the first and second
    '*** ...' marker lines. If no marker is found, nothing is written;
    the footer marker and everything after it are discarded.
    """
    in_body = False
    with open(input_file, encoding='utf-8', errors='ignore') as reader, \
            open(output_file, 'w', encoding='utf-8') as writer:
        for line in reader:
            if is_special_line(line):
                if in_body:
                    break  # second marker: footer begins, stop copying
                in_body = True  # first marker: body starts on the next line
                continue
            if in_body:
                writer.write(line)

    # Bug fix: message previously read "save as" instead of "saved as".
    print(f"Cleaned file saved as {output_file}")


def final_clean_version(filename, output_file='final_cleaned.txt'):
    """Lower-case the text, strip punctuation, and normalise whitespace.

    Reads *filename*, removes every ASCII punctuation character,
    lower-cases the result, collapses runs of spaces/tabs inside each
    line to a single space, writes the result to *output_file*, and
    returns *output_file*.
    """
    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()

    no_punct = ''.join(ch for ch in text if ch not in string.punctuation).lower()
    # Bug fix: joining with '' mashed each line's words together
    # ("romeo and" -> "romeoand"), which broke every later word-based
    # analysis. Join with a single space to keep word boundaries while
    # still collapsing runs of whitespace.
    cleaned_lines = [' '.join(line.split()) for line in no_punct.splitlines()]
    cleaned_text = '\n'.join(cleaned_lines)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)
    print(f"Final cleaned text saved as {output_file}")
    return output_file




def remove_stopwords(filename):
    """Remove common English stopwords from *filename*, rewriting it in place.

    Matching is case-insensitive; surviving words keep their original
    form and order, separated by single spaces.
    """
    # A set gives O(1) membership tests; the original list was scanned
    # linearly for every word in the file.
    stopwords = {"a", "of", "to", "in", "it", "is", "i", "that", "was", "he",
                 "you", "for", "on", "with", "as", "his", "they", "be", "at",
                 "one", "have", "this", "from", "or", "had", "by", "not",
                 "but", "what", "all", "were", "we", "her", "can", "an"}

    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    new_lines = []
    for line in lines:
        kept = [w for w in line.split() if w.lower() not in stopwords]
        # Bug fix: ''.join glued the surviving words together; use a
        # space so word boundaries survive for later analysis.
        new_lines.append(' '.join(kept))

    with open(filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(new_lines))

    print("Stopwords removed from the text")


if __name__ == "__main__":
    # Demo run: fetch Romeo and Juliet from Project Gutenberg and push it
    # through the full cleaning pipeline (boilerplate -> punctuation ->
    # stopwords).
    url = 'https://www.gutenberg.org/cache/epub/1513/pg1513.txt'
    raw = download_data(url)

    # Only clean when the download actually succeeded (raw is not None).
    if raw:
        clean_file(raw, 'cleaned_text.txt')
        final_clean_version('cleaned_text.txt')
        remove_stopwords('final_cleaned.txt')
33 changes: 33 additions & 0 deletions Assignment_files.py/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from download import download_data, clean_file, final_clean_version, remove_stopwords
from analysis import text_summary_statistics, common_words_chart, sentiment_analysis
from translation import create_dictionary, translate_text_file
from similarity import text_similarity


def main():
    """Run the whole project end to end.

    Downloads the raw text, cleans it, produces statistics / chart /
    sentiment, translates Old-English words via the external dictionary,
    and finally measures similarity between original and translation.
    """
    print("\n Text Analysis \n")

    url = 'https://www.gutenberg.org/cache/epub/1513/pg1513.txt'
    raw_file = download_data(url)
    if not raw_file:
        # Download failed; nothing downstream can run without the text.
        return

    # Cleaning pipeline: strip Gutenberg boilerplate, then punctuation,
    # then stopwords.
    clean_file(raw_file, "cleaned_text.txt")
    final_clean_version("cleaned_text.txt")
    remove_stopwords("final_cleaned.txt")

    # Analysis passes over the cleaned corpus.
    text_summary_statistics("final_cleaned.txt")
    common_words_chart("final_cleaned.txt")
    sentiment_analysis("final_cleaned.txt")

    # Old-English -> modern-English translation pass.
    dictionary = create_dictionary("development/Dictionary.txt")
    translate_text_file("final_cleaned.txt", "translated_text.txt", dictionary)

    # How close is the translated text to the cleaned original?
    text_similarity("final_cleaned.txt", "translated_text.txt")

    print("\n End of Analysis \n")

if __name__ == "__main__":
    # Entry point: run the complete download/clean/analyse/translate pipeline.
    main()
17 changes: 17 additions & 0 deletions Assignment_files.py/similarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from thefuzz import fuzz

def text_similarity(file1, file2, sample_size=10000):
    """Print and return the fuzzy similarity ratio of two text files.

    Only the first *sample_size* characters of each file are compared,
    which keeps the fuzzy match fast on long texts.
    """
    samples = []
    for path in (file1, file2):
        with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
            samples.append(handle.read(sample_size))

    ratio = fuzz.ratio(samples[0], samples[1])

    print(f"Text similarity ratio {ratio}")
    return ratio

if __name__ == "__main__":
    # Compare the cleaned original against its translated counterpart.
    text_similarity("final_cleaned.txt", "translated_text.txt")
44 changes: 44 additions & 0 deletions Assignment_files.py/translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import string

def create_dictionary(filename):
    """Load 'old - new' word pairs from *filename* into a dict.

    Each useful line has the form ``old - new``; both sides are
    lower-cased and stripped of surrounding whitespace. Lines without
    exactly one '-' separator are ignored.
    """
    translations = {}
    with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw in handle:
            pieces = raw.strip().lower().split('-')
            if len(pieces) != 2:
                continue  # blank or malformed line
            old_word, new_word = pieces
            translations[old_word.strip()] = new_word.strip()

    print(f"{len(translations)} translation words.")
    return translations

def translate_words(word, dictionary):
    """Look up a single word in *dictionary*, ignoring case and punctuation.

    Returns the translation when the cleaned form of *word* is a key;
    otherwise returns *word* untouched (original casing and punctuation).
    """
    key = word.strip(string.punctuation).lower()
    if key in dictionary:
        return dictionary[key]
    return word


def translate_text_file(input_file, output_file, dictionary):
    """Write a word-by-word translated copy of *input_file* to *output_file*.

    Each whitespace-separated word is passed through translate_words;
    translated lines are joined with single spaces and newlines.
    """
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as src:
        source_lines = src.readlines()

    output_lines = []
    for source_line in source_lines:
        converted = [translate_words(token, dictionary) for token in source_line.split()]
        output_lines.append(' '.join(converted))

    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.write('\n'.join(output_lines))

    print(f"Translation saved to {output_file}")


if __name__ == "__main__":
    # Build the Old-English -> modern-English lookup table, then translate
    # the cleaned corpus into a new file.
    dictionary = create_dictionary("development/Dictionary.txt")
    translate_text_file("final_cleaned.txt", "translated_text.txt", dictionary)



68 changes: 67 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,69 @@
# Text-Analysis-Project

Please read the [instructions](instructions.md).

#Project Write Up and Reflection

#1. Project Overview

This project focuses on doing an overview of the play 'Romeo and Juliet' written by William Shakespeare. The functions of the code focused on retrieving the data set from the 'Project Gutenberg' API. The overall project focused on cleaning and polishing all of the dataset; this includes removing all of the punctuation and stopwords, making all the characters lower case, and removing any unnecessary information from the data set. All of this was done using computational methods to then run analyses, such as a sentiment analysis, to extract different information from the data set. After running those analysis methods, I then introduced a document containing translations of Old English words to Modern English words and translated the original file into an updated version with modern English words. Throughout the entire process I used several functions such as text cleaning, stopword removal, word frequency analysis, text summary, visualisation and sentiment analysis, using the NLTK and TheFuzz packages. My goal through this project was to identify different ways in which texts can be analyzed and to determine whether there was a way to introduce another data set to make the book easier for people to understand.


#2.Implementation

I started the project using an .ipynb file where I tested all of the data and went through the creation process of the code. This allowed me to check where functions could be improved or if there was any error that needed to be fixed. This made it easier for me to then pass the code to a .py file and run the final version of the project. After finishing drafting the code, I created several python modules to organize the data more clearly.

For data structures I focused mainly on using dictionaries to store the word frequencies of the data sets and to run the translation for the text. This approach made it easier because it allowed me to keep adding information whilst still running all of the analyses. I also chose to separate all of the data into multiple .py modules as it kept the code cleaner and allowed for independent testing of each section before merging it all together. I also used help from ChatGPT at various stages of this process to draft functions that I wasn't fully aware how to implement (my reasoning on how they work is in the comments in the .ipynb file), and it also helped me with debugging issues. The use of a virtual assistant allowed me to restructure my code more clearly and to make sure it ran all of the functions.

#3. Results

As mentioned, I ran several .py modules that produced different analyses and results. I created five Python modules: download.py, which focuses on downloading, cleaning and processing the file; analysis.py, which runs all of the analysis functions such as a bar chart visualisation, sentiment analysis, etc.; translation.py, which reads the external dictionary file to translate the Old English words in the data files into Modern English; similarity.py, which runs a text similarity comparison between both texts, old and new; and finally main.py, which combines all of the functions into one main module that processes the text and runs all of the analyses in one place.

These allowed me to identify the following results:

Top 10 most common words:

and : 733
the : 688
my : 355
romeo : 298
thou : 277
me : 263
juliet : 178
thy : 170
o : 149
will : 148

Text Summary Statistics:

Total words: 19309
Unique words: 3819
Average word length: 4.76
Average sentence length (in words): 0
Vocabulary richness: 0.198

SENTIMENT ANALYSIS RESULTS
Negative: 0.137
Neutral: 0.7
Positive: 0.163
Overall Compound Score: 1.0

→ The overall sentiment of the text is Positive. (this was a very surprising result given the tragic connotation of Romeo and Juliet)

TEXT SIMILARITY ANALYSIS
Basic Ratio: 98
Partial Ratio: 98
Token Sort Ratio: 96
Token Set Ratio: 99

→ The two texts are very similar.

The bar graph helped plot the frequency of the most common words, I just cannot add an image to the text.

#4. Reflection

The process of this project worked very well, as I managed to get the dictionary to translate the words and run all of the analysis functions on the text. Also, separating the functions into individual Python modules made it easier to manage the data and work step by step. I believe my biggest challenge in this assignment was ensuring that the translation worked correctly, specifically with all of the punctuation and spacing. The most complicated words to translate were the ones with an apostrophe, such as 'they'd', because when I removed all of the punctuation from the text it became challenging for the dictionary to translate these words. That's why I had to go back to the code and make sure that only certain punctuation symbols were removed and not all of them. Another challenge was performance when comparing long texts using TheFuzz; I learned to limit the text sample size for efficiency, as it was taking many minutes to compare both texts together.

My biggest takeaway from this project was realizing how important it is to clean and process real-world text data before running analyses, because otherwise the results given by the functions won't provide an accurate measurement. I also learned (and was surprised) that natural language processing tools such as the NLTK package can help analyze the emotional tone of a novel and focus on the emotional perspective of a dataset rather than just performing statistical analysis. Throughout this process, using a virtual assistant such as ChatGPT was very helpful for completing certain functions and understanding the reasoning behind different approaches — like using dictionaries for word frequency or splitting the project into modules — to get the results I wanted to obtain. The virtual assistant also helped me debug the code and fix errors in my functions, making the overall creation process more effective.

If I were to think of a recommendation to improve this project, I would like to expand the dictionary for a more comprehensive translation covering all of Shakespeare's plays. At the same time, I believe it would be interesting to create a proper Python dictionary that can be reused across multiple files. I would also recommend implementing Markov text synthesis to generate sentences in the same English style that William Shakespeare uses; that way, new texts could also be generated with Old English words to produce more period-accurate writing. Overall, I believe this project was really interesting to implement, and it provided useful findings that taught me the importance of data cleaning, data processing and modular design.

Loading