Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 173 additions & 0 deletions .virtual_documents/Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
import urllib.request

def download_book():
    """Download the plain-text Oliver Twist ebook from Project Gutenberg.

    Returns:
        The full book text as a str on success, or None if the download
        or decoding fails.
    """
    url = 'https://www.gutenberg.org/cache/epub/730/pg730.txt'  # Oliver Twist
    try:
        # Bound the request so a stalled connection cannot hang the script.
        with urllib.request.urlopen(url, timeout=30) as response:
            text = response.read().decode('utf-8')
        print("Download successful!")
        return text
    except Exception as e:
        # Best-effort download: report the problem and signal failure
        # with None instead of crashing the whole script.
        print("An error occurred while downloading:", e)
        return None

def main():
    """Fetch the book and persist a local copy for the later steps."""
    text = download_book()
    if not text:
        # Download failed (or returned nothing): nothing to save.
        return
    # Save to file so subsequent cells can re-read it without re-downloading.
    with open("oliver_twist.txt", "w", encoding="utf-8") as f:
        f.write(text)
    print("Text saved to 'oliver_twist.txt'.")

# Run the download/save pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()


def clean_gutenberg_text(text):
    """Strip Project Gutenberg boilerplate and normalize the text.

    Removes everything outside the standard ``*** START/END OF THE PROJECT
    GUTENBERG EBOOK`` markers, drops every character that is not a letter,
    digit or whitespace, and lowercases the result.

    Args:
        text: Raw ebook text as downloaded from Project Gutenberg.

    Returns:
        The cleaned, lowercased body text. If either marker is missing,
        the whole (cleaned) input is returned and a warning is printed.
    """
    import re

    start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
    end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"

    start = text.find(start_marker)
    # Search for the end marker only *after* the start marker: a stray
    # mention of the end marker earlier in the file would otherwise
    # produce an empty or reversed slice.
    if start != -1:
        end = text.find(end_marker, start + len(start_marker))
    else:
        end = text.find(end_marker)

    if start != -1 and end != -1:
        text = text[start + len(start_marker):end]
    else:
        print("Warning: Start or end marker not found.")

    # Keep only letters, numbers and whitespace, then lowercase.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower()



# load and clean saved file

# Load the text from the file written earlier by main().
with open("oliver_twist.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Clean it: strip Gutenberg boilerplate and punctuation, lowercase.
# 'clean_text' is reused by the later analysis cells.
clean_text = clean_gutenberg_text(raw_text)

# Preview first 500 characters
print(clean_text[:500])



# Remove Stop Words & Count Word Frequencies

# NOTE: '!pip install ...' is IPython/Jupyter shell magic, not Python
# syntax; this line only runs inside a notebook cell.
!pip install nltk

import nltk
# Download the NLTK stop-word corpus used by remove_stop_words() below.
nltk.download('stopwords')



# Remove Stop Words from Your Text

from nltk.corpus import stopwords

def remove_stop_words(text, stop_words=None):
    """Split *text* into words and drop common stop words.

    Args:
        text: Input text; words are assumed to be whitespace-separated.
        stop_words: Optional collection of words to filter out. Defaults
            to NLTK's English stop-word list (requires the 'stopwords'
            corpus to be downloaded).

    Returns:
        A list of the remaining words, in their original order.
    """
    if stop_words is None:
        # Default: NLTK's common English stop words.
        stop_words = set(stopwords.words('english'))
    else:
        # Accept any iterable; a set gives O(1) membership tests.
        stop_words = set(stop_words)
    words = text.split()
    return [word for word in words if word not in stop_words]

# Filter the cleaned book text; 'filtered_words' is reused by the
# frequency-count and summary-statistics cells below.
filtered_words = remove_stop_words(clean_text)

# Show first 50 words after removing stop words
print(filtered_words[:50])


# Count Word Frequencies

from collections import Counter

# Count word frequency across the stop-word-filtered word list.
word_freq = Counter(filtered_words)

# Show top 20 most common words
print(word_freq.most_common(20))



# Summary Statistics

def calculate_summary_stats(words):
    """Print total/unique word counts and the average word length.

    Args:
        words: List of word strings (e.g. after stop-word removal).
    """
    total_words = len(words)
    unique_words = len(set(words))
    # Guard against an empty list to avoid ZeroDivisionError.
    if total_words:
        avg_word_length = sum(len(word) for word in words) / total_words
    else:
        avg_word_length = 0.0

    print(f"Total words (after stop word removal): {total_words}")
    print(f"Unique words: {unique_words}")
    print(f"Average word length: {avg_word_length:.2f}")

# Report summary statistics for the stop-word-filtered book text.
calculate_summary_stats(filtered_words)



# Visualization (Bar Chart)

import matplotlib.pyplot as plt



def plot_top_words(word_freq, n=10):
    """Render a bar chart of the *n* most frequent words in *word_freq*.

    Args:
        word_freq: A collections.Counter of word -> occurrence count.
        n: How many of the most common words to chart (default 10).
    """
    top = word_freq.most_common(n)
    labels = [pair[0] for pair in top]
    frequencies = [pair[1] for pair in top]

    plt.figure(figsize=(10, 5))
    plt.bar(labels, frequencies)
    plt.title(f"Top {n} Most Common Words")
    plt.xlabel("Words")
    plt.ylabel("Frequency")
    # Angle the x labels so longer words remain readable.
    plt.xticks(rotation=45)
    plt.show()

# Visualize the ten most frequent words.
plot_top_words(word_freq, n=10)


# Sentiment Analysis (Optional Feature)

import nltk
# Download the VADER lexicon required by SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')



from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize the sentiment analyzer (also reused for the per-line analysis).
analyzer = SentimentIntensityAnalyzer()

# Example: test on a sentence
sentence = "Oliver was happy to see his friend again!"
# polarity_scores returns a dict of 'neg'/'neu'/'pos'/'compound' floats.
score = analyzer.polarity_scores(sentence)

print(f"Sentiment for: {sentence}")
print(score)



# Split text into lines
lines = clean_text.split("\n")

# Analyze the first 10 non-empty lines.
# FIX: the original used lines.index(line) on the *stripped* line, which
# raises ValueError whenever stripping changed the line (the stripped
# string is not in 'lines'), and for duplicate lines always returns the
# first occurrence — so the "stop after 10" check was broken. Count the
# printed lines explicitly instead.
shown = 0
for line in lines:
    line = line.strip()
    if not line:
        continue

    score = analyzer.polarity_scores(line)
    print(f"Line: {line[:60]}...")  # Show first 60 characters
    print(f"Sentiment: {score}")
    print()

    # Stop after showing 10 examples
    shown += 1
    if shown >= 10:
        break



Loading