diff --git a/README.md b/README.md index 05aa109..3c676e1 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,52 @@ # Text-Analysis-Project -Please read the [instructions](instructions.md). +# 1. Project Overview + +For this project, I used the text of *Oliver Twist* by Charles Dickens, which I accessed from Project Gutenberg. My main goal was to explore how to harvest a real book from the internet and then analyze its contents using basic text processing techniques. I wanted to identify the most frequently used words and also evaluate the overall tone or sentiment of the text. I used Python to clean the text, filter out common "stop words," count word frequencies, and perform sentiment analysis. Through this process, I hoped to improve my skills in working with text data and learn how to apply natural language processing techniques in a real-world context. + +# 2. Implementation + +I structured the project into three main Python files: +- `harvest_text.py`: handles downloading the text from a URL +- `analyze.py`: contains all the text processing and analysis functions +- `main.py`: serves as the entry point that ties everything together and prints results + +In `harvest_text.py`, I used `urllib.request` to download the raw text from the provided Project Gutenberg link. This part was fairly straightforward, and I followed the structure given in the instructions. + +In `analyze.py`, I wrote functions for word cleaning, stop word removal, word frequency counting, and sentiment analysis. I originally planned to just use `.split()` to tokenize words, but that included punctuation and other symbols. After getting stuck, I asked ChatGPT to help me understand a regular expression that would return only clean words, which led to using `re.findall(r'\b[a-z]+\b', text)`. I also used the `nltk` library for stopword filtering and sentiment analysis. 
I had to install and download the necessary NLTK resources (`stopwords` and `vader_lexicon`), and I used the `SentimentIntensityAnalyzer` to get sentiment scores. GPT helped me understand how to interpret the output from this tool and what each score meant. + +In `main.py`, I called all the functions in a logical sequence: download the text, extract words, filter stopwords, count word frequencies, and run sentiment analysis. The final result prints out the top 10 most common words and the sentiment scores for the entire book. + +# 3. Results + +After processing *Oliver Twist*, I was able to identify the most frequently used non-stopwords. Most of them are key characters or common narrative terms, which makes sense in the context of a novel. Here’s the output of the top 10 words: + +Top 10 words (after removing stopwords): +said: 1233 +mr: 1080 +oliver: 880 +upon: 481 +one: 466 +replied: 464 +old: 450 +would: 413 +man: 398 +bumble: 397 + +For sentiment analysis, I used `nltk`’s built-in VADER tool. This gave me a breakdown of how positive, neutral, or negative the overall tone of the book was. Given the darker themes in *Oliver Twist*, I expected it to lean more negative or neutral. Here’s what the sentiment scores looked like: + +Running sentiment analysis... +Sentiment scores: +neg: 0.092 +neu: 0.784 +pos: 0.124 +compound: 1.0 + +The scores suggest that while the text contains some negativity (0.092), the language is mostly neutral in tone (0.784). The compound score of 1.0 means the aggregated sentiment over the whole book registers as strongly positive — VADER normalizes this score to the range -1 to 1, so over a text this long the slightly larger positive signal (0.124 vs. 0.092 negative) saturates it toward the maximum. + +# 4. Reflection + +Overall, I’m really happy with how the project turned out. I was able to write most of the code myself and organize the project in a clean and modular way. The hardest part for me was working with regular expressions — I wasn’t sure how to extract only the real words from the text at first. 
I also wasn’t confident in how sentiment analysis worked, but ChatGPT helped explain both of these parts clearly. It didn’t write everything for me, but it made it easier to understand tricky concepts that I probably would’ve struggled with on my own. + +From a learning perspective, I now feel more confident using libraries like `nltk` and working with large text datasets. I also learned how important it is to clean and structure text properly before trying to analyze it. If I were to continue the project, I’d probably add a second book for comparison or try visualizing the word frequencies with a bar chart. I feel like the scope was very appropriate for my level and I was able to test my code frequently to make sure each step was working correctly. + diff --git a/analyze.py b/analyze.py new file mode 100644 index 0000000..0678b64 --- /dev/null +++ b/analyze.py @@ -0,0 +1,45 @@ +# This lets us use regular expressions, which help us extract just the words from a big chunk of text. +import re +# Counter is a really useful tool that helps count how many times each word shows up. +from collections import Counter +# nltk is a popular Python library that helps us work with natural language (text data). +import nltk +# This gives us a list of very common English words like "the", "and", "is" that we usually want to ignore. +from nltk.corpus import stopwords +# This tool helps us measure the "mood" or sentiment of the text — whether it sounds positive, negative, or neutral. +from nltk.sentiment.vader import SentimentIntensityAnalyzer + +# These two lines download some tools from nltk the first time we run the code. +# NOTE: they execute at import time on every run; nltk skips re-downloading once the data is cached locally. +nltk.download('stopwords') # downloads the list of stopwords +nltk.download('vader_lexicon') # downloads the sentiment scoring tool + +# This function breaks a big string of text into a list of individual words. +# Takes a str and returns a list of lowercase, purely alphabetic tokens. +def get_words(text): + text = text.lower() + # This line uses a regular expression to grab only actual words made of letters. 
+ # I wasn’t totally sure how this worked, so I used GPT to help me understand what \b[a-z]+\b means + # At first I was thinking of using split to break the text into words, but that would have included punctuation + # NOTE: [a-z]+ keeps only runs of letters, so contractions split apart (e.g. "don't" becomes "don" and "t") and digits are dropped. + words = re.findall(r'\b[a-z]+\b', text) + return words + +# This function removes stop words +# Takes a list of words and returns a new list with nltk's English stopwords filtered out (input list is not modified). +def remove_stopwords(words): + # Get the default list of stopwords from nltk. + stop_words = set(stopwords.words('english')) + # Keep only the words that are NOT in the stopwords list. + filtered = [word for word in words if word not in stop_words] + return filtered + +# This function counts how many times each word appears. +# Returns a collections.Counter mapping word -> occurrence count. +def count_words(words): + # This uses Python’s Counter class to count things. Very useful for word counts. + return Counter(words) + +# This function checks how positive, negative, or neutral the overall text is. +# Returns VADER's dict of 'neg', 'neu', 'pos' and 'compound' scores for the whole text. +def analyze_sentiment(text): + # Create a sentiment analyzer (provided by nltk). + sid = SentimentIntensityAnalyzer() + # Get a dictionary of sentiment scores for the text. + # It returns values like: {'neg': 0.1, 'neu': 0.6, 'pos': 0.3, 'compound': 0.4} + # I used GPT to learn how to interpret these scores and what they mean. 
+ return sid.polarity_scores(text) diff --git a/harvest_text.py b/harvest_text.py new file mode 100644 index 0000000..a18261c --- /dev/null +++ b/harvest_text.py @@ -0,0 +1,11 @@ +# I got this code directly from the instructions.md +import urllib.request + +# Download the text file at `url` and return its contents as a UTF-8 decoded str. +# On any failure it prints the error and returns the empty string, so callers always get a str back. +def harvest_gutenberg_text(url): + try: + with urllib.request.urlopen(url) as f: + text = f.read().decode('utf-8') + return text + except Exception as e: + print("An error occurred:", e) + return "" diff --git a/main.py b/main.py new file mode 100644 index 0000000..abff03b --- /dev/null +++ b/main.py @@ -0,0 +1,31 @@ +# These functions are the ones we defined in analyze.py +from analyze import get_words, remove_stopwords, count_words, analyze_sentiment +from harvest_text import harvest_gutenberg_text + +# Entry point: downloads the book, prints the top-10 word counts, then prints the sentiment scores. +def main(): + # This is the Project Gutenberg link to Oliver Twist + url = "https://www.gutenberg.org/cache/epub/730/pg730.txt" + print("Downloading text from Project Gutenberg...") + # Fetch the book text from the web + # NOTE(review): the raw file presumably still contains Project Gutenberg's header/footer boilerplate, + # so the counts and sentiment below include it — confirm whether it should be stripped first. + raw_text = harvest_gutenberg_text(url) + # Break the text into a list of words + words = get_words(raw_text) + # Remove common words + filtered_words = remove_stopwords(words) + # Count how often each word appears + word_counts = count_words(filtered_words) + + # Print the 10 most common words and their counts + print("\nTop 10 words (after removing stopwords):") + for word, count in word_counts.most_common(10): + print(f"{word}: {count}") + + # Use sentiment analysis to see if the overall tone is positive/negative/neutral + print("\nRunning sentiment analysis...") + sentiment_scores = analyze_sentiment(raw_text) + print("Sentiment scores:") + for category, score in sentiment_scores.items(): + print(f"{category}: {score}") + +if __name__ == "__main__": + main()