Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 173 additions & 0 deletions .virtual_documents/Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
import urllib.request

def download_book():
    """Download the plain-text Oliver Twist ebook from Project Gutenberg.

    Returns:
        The full book text as a str on success, or None if the download
        or decoding fails.
    """
    url = 'https://www.gutenberg.org/cache/epub/730/pg730.txt'  # Oliver Twist
    try:
        # Bound the request so a stalled connection cannot hang the script.
        with urllib.request.urlopen(url, timeout=30) as response:
            text = response.read().decode('utf-8')
        print("Download successful!")
        return text
    except Exception as e:
        # Best-effort download: report the problem and signal failure
        # with None instead of crashing the whole script.
        print("An error occurred while downloading:", e)
        return None

def main():
    """Fetch the book and persist a local copy for the later steps."""
    text = download_book()
    if not text:
        # Download failed (or returned nothing): nothing to save.
        return
    # Save to file so subsequent cells can re-read it without re-downloading.
    with open("oliver_twist.txt", "w", encoding="utf-8") as f:
        f.write(text)
    print("Text saved to 'oliver_twist.txt'.")

# Run the download/save pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()


def clean_gutenberg_text(text):
    """Strip Project Gutenberg boilerplate and normalize the text.

    Removes everything outside the standard ``*** START/END OF THE PROJECT
    GUTENBERG EBOOK`` markers, drops every character that is not a letter,
    digit or whitespace, and lowercases the result.

    Args:
        text: Raw ebook text as downloaded from Project Gutenberg.

    Returns:
        The cleaned, lowercased body text. If either marker is missing,
        the whole (cleaned) input is returned and a warning is printed.
    """
    import re

    start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
    end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"

    start = text.find(start_marker)
    # Search for the end marker only *after* the start marker: a stray
    # mention of the end marker earlier in the file would otherwise
    # produce an empty or reversed slice.
    if start != -1:
        end = text.find(end_marker, start + len(start_marker))
    else:
        end = text.find(end_marker)

    if start != -1 and end != -1:
        text = text[start + len(start_marker):end]
    else:
        print("Warning: Start or end marker not found.")

    # Keep only letters, numbers and whitespace, then lowercase.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower()



# load and clean saved file

# Load the text from the file written earlier by main().
with open("oliver_twist.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Clean it: strip Gutenberg boilerplate and punctuation, lowercase.
# 'clean_text' is reused by the later analysis cells.
clean_text = clean_gutenberg_text(raw_text)

# Preview first 500 characters
print(clean_text[:500])



# Remove Stop Words & Count Word Frequencies

# NOTE: '!pip install ...' is IPython/Jupyter shell magic, not Python
# syntax; this line only runs inside a notebook cell.
!pip install nltk

import nltk
# Download the NLTK stop-word corpus used by remove_stop_words() below.
nltk.download('stopwords')



# Remove Stop Words from Your Text

from nltk.corpus import stopwords

def remove_stop_words(text, stop_words=None):
    """Split *text* into words and drop common stop words.

    Args:
        text: Input text; words are assumed to be whitespace-separated.
        stop_words: Optional collection of words to filter out. Defaults
            to NLTK's English stop-word list (requires the 'stopwords'
            corpus to be downloaded).

    Returns:
        A list of the remaining words, in their original order.
    """
    if stop_words is None:
        # Default: NLTK's common English stop words.
        stop_words = set(stopwords.words('english'))
    else:
        # Accept any iterable; a set gives O(1) membership tests.
        stop_words = set(stop_words)
    words = text.split()
    return [word for word in words if word not in stop_words]

# Filter the cleaned book text; 'filtered_words' is reused by the
# frequency-count and summary-statistics cells below.
filtered_words = remove_stop_words(clean_text)

# Show first 50 words after removing stop words
print(filtered_words[:50])


# Count Word Frequencies

from collections import Counter

# Count word frequency across the stop-word-filtered word list.
word_freq = Counter(filtered_words)

# Show top 20 most common words
print(word_freq.most_common(20))



# Summary Statistics

def calculate_summary_stats(words):
    """Print total/unique word counts and the average word length.

    Args:
        words: List of word strings (e.g. after stop-word removal).
    """
    total_words = len(words)
    unique_words = len(set(words))
    # Guard against an empty list to avoid ZeroDivisionError.
    if total_words:
        avg_word_length = sum(len(word) for word in words) / total_words
    else:
        avg_word_length = 0.0

    print(f"Total words (after stop word removal): {total_words}")
    print(f"Unique words: {unique_words}")
    print(f"Average word length: {avg_word_length:.2f}")

# Report summary statistics for the stop-word-filtered book text.
calculate_summary_stats(filtered_words)



# Visualization (Bar Chart)

import matplotlib.pyplot as plt



def plot_top_words(word_freq, n=10):
    """Render a bar chart of the *n* most frequent words in *word_freq*.

    Args:
        word_freq: A collections.Counter of word -> occurrence count.
        n: How many of the most common words to chart (default 10).
    """
    top = word_freq.most_common(n)
    labels = [pair[0] for pair in top]
    frequencies = [pair[1] for pair in top]

    plt.figure(figsize=(10, 5))
    plt.bar(labels, frequencies)
    plt.title(f"Top {n} Most Common Words")
    plt.xlabel("Words")
    plt.ylabel("Frequency")
    # Angle the x labels so longer words remain readable.
    plt.xticks(rotation=45)
    plt.show()

# Visualize the ten most frequent words.
plot_top_words(word_freq, n=10)


# Sentiment Analysis (Optional Feature)

import nltk
# Download the VADER lexicon required by SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')



from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize the sentiment analyzer (also reused for the per-line analysis).
analyzer = SentimentIntensityAnalyzer()

# Example: test on a sentence
sentence = "Oliver was happy to see his friend again!"
# polarity_scores returns a dict of 'neg'/'neu'/'pos'/'compound' floats.
score = analyzer.polarity_scores(sentence)

print(f"Sentiment for: {sentence}")
print(score)



# Split text into lines
lines = clean_text.split("\n")

# Analyze the first 10 non-empty lines.
# FIX: the original used lines.index(line) on the *stripped* line, which
# raises ValueError whenever stripping changed the line (the stripped
# string is not in 'lines'), and for duplicate lines always returns the
# first occurrence — so the "stop after 10" check was broken. Count the
# printed lines explicitly instead.
shown = 0
for line in lines:
    line = line.strip()
    if not line:
        continue

    score = analyzer.polarity_scores(line)
    print(f"Line: {line[:60]}...")  # Show first 60 characters
    print(f"Sentiment: {score}")
    print()

    # Stop after showing 10 examples
    shown += 1
    if shown >= 10:
        break



Loading