Skip to content

Comments

Assignment 2#104

Open
avargasd154 wants to merge 3 commits into OIM3640:main from avargasd154:main
Open

Assignment 2#104
avargasd154 wants to merge 3 commits into OIM3640:main from avargasd154:main

Conversation

@avargasd154
Copy link

No description provided.

@avargasd154
Copy link
Author

`import random
import string
import sys
from unicodedata import category

import urllib.request

# Common English stop words to exclude from word-frequency counts
# (list AI generated).
stop_words = {
    'the', 'and', 'to', 'of', 'a', 'in', 'that', 'is', 'was', 'it', 'for', 'on', 'with',
    'as', 'he', 'be', 'at', 'by', 'i', 'this', 'had', 'not', 'are', 'but', 'from', 'or',
    'have', 'an', 'they', 'which', 'you', 'one', 'were', 'her', 'all', 'she', 'there',
    'would', 'their', 'we', 'him', 'been', 'has', 'when', 'who', 'will', 'more', 'no',
    'if', 'out', 'so', 'up', 'what', 'said', 'about', 'than', 'its', 'into', 'them',
    'then', 'some', 'could', 'my', 'may', 'these', 'can', 'his', 'me', 'do', 'such',
}

"""East of Eden"""
url1 = 'https://www.gutenberg.org/cache/epub/75781/pg75781.txt'
try:
with urllib.request.urlopen(url1) as f:
text1 = f.read().decode('utf-8')
except Exception as e:
print("An error occurred:", e)

"""The Great Gatsby"""
url2 = 'https://www.gutenberg.org/cache/epub/64317/pg64317.txt'
try:
with urllib.request.urlopen(url2) as f:
text2 = f.read().decode('utf-8')
except Exception as e:
print("An error occurred:", e)

# Summarize a text by counting how many times each particular word appears.

def count_word_frequencies(text, remove_stop):
    """Return a dict mapping each word in *text* to its frequency.

    text: the raw text to analyze (case-insensitive; punctuation ignored).
    remove_stop: when True, words in the module-level ``stop_words`` set
        are skipped and not counted.
    """
    text = text.lower()

    # Replace punctuation with spaces so "word," and "word" count together.
    for char in string.punctuation:
        text = text.replace(char, ' ')

    word_freq = {}
    for word in text.split():
        # Skip empty tokens and (optionally) common stop words.
        if not word or (remove_stop and word in stop_words):
            continue
        word_freq[word] = word_freq.get(word, 0) + 1

    return word_freq

# How many times do the most popular words show up in each book?
# remove_stop=True excludes the common stop words from the counts.
eden_freq = count_word_frequencies(text1, remove_stop=True)
gatsby_freq = count_word_frequencies(text2, remove_stop=True)

def frequency(item):
    """Sort key: return the count from a (word, count) pair."""
    _, count = item
    return count

def top_words(word_freq, title, n=20):
    """Print and return the *n* most frequent (word, count) pairs.

    word_freq: dict mapping word -> count.
    title: label used in the printed header.
    n: how many entries to show (default 20).

    Returns the ranked list so callers can reuse it; the original
    returned None, so ignoring the return value stays valid.
    """
    print(f"The top {n} words in {title} are:")
    # Sort by count descending; Python's sort is stable, so ties keep
    # their original relative order.
    ranked = sorted(word_freq.items(), key=lambda pair: pair[1], reverse=True)[:n]
    for word, freq in ranked:
        print(f"{word}: {freq}")
    return ranked

# Report the 20 most frequent non-stop words for each novel.
top_words(eden_freq, "East of Eden")
top_words(gatsby_freq, "The Great Gatsby")

"""Movie Reviews"""

from imdb import Cinemagoer

# create an instance of the Cinemagoer class

ia = Cinemagoer()

# search movie

movie1 = ia.search_movie("East of Eden")[0]

print(movie1.movieID)

# Get reviews

movie1 = ia.get_movie('0048028', info=['reviews']) # Make sure to add the second argument

reviews1 = movie1.get('reviews', [])

for review in reviews1:

print(review['content'])

print()

# Get actor

James_Dean = ia.get_person('0000015')

# Get actor's movies

filmography = James_Dean.get('filmography', {})

films_as_actor = filmography.get('actor', [])

print(films_as_actor)

"""The Great Gatsby"""

# search movie

movie2 = ia.search_movie("The Great Gatsby")[0]

print(movie2.movieID)

# Get reviews

movie2 = ia.get_movie('1343092', info=['reviews']) # Make sure to add the second argument

reviews2 = movie2.get('reviews', [])

for review in reviews2:

print(review['content'])

print()

# Get actor

Leo_DiCap = ia.get_person('0000138')

# Get actor's movies

filmography = Leo_DiCap.get('filmography', {})

films_as_actor = filmography.get('actor', [])

print(films_as_actor)

from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# The VADER lexicon must be downloaded once before scoring.
nltk.download('vader_lexicon')

sentence = 'Software Design is my favorite class because learning Python is so cool!'
score = SentimentIntensityAnalyzer().polarity_scores(sentence)
print(score)

# Example output:
# {'neg': 0.0, 'neu': 0.614, 'pos': 0.386, 'compound': 0.7417}

from thefuzz import fuzz

# Quick demonstration of thefuzz string-similarity scorers.
a, b = "this is a test", "this is a test!"
print(fuzz.ratio(a, b))             # 97
print(fuzz.partial_ratio(a, b))     # 100

c, d = "fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"
print(fuzz.ratio(c, d))             # 91
print(fuzz.token_sort_ratio(c, d))  # 100

import numpy as np
from sklearn.manifold import MDS
import matplotlib.pyplot as plt

# These are the similarities computed in the previous section.
S = np.asarray([[1., 0.90850572, 0.96451312, 0.97905034, 0.78340575],
                [0.90850572, 1., 0.95769915, 0.95030073, 0.87322494],
                [0.96451312, 0.95769915, 1., 0.98230284, 0.83381607],
                [0.97905034, 0.95030073, 0.98230284, 1., 0.82953109],
                [0.78340575, 0.87322494, 0.83381607, 0.82953109, 1.]])

# Dissimilarity is 1 minus similarity.
dissimilarities = 1 - S

# Compute the 2-D embedding from the precomputed dissimilarity matrix.
coord = MDS(dissimilarity='precomputed').fit_transform(dissimilarities)

plt.scatter(coord[:, 0], coord[:, 1])

# Label each point with its row index.
for i in range(coord.shape[0]):
    plt.annotate(str(i), (coord[i, 0], coord[i, 1]))

plt.show()

"""Request and Open API""" # Why is my API wrong?

"""Request and Open API"""

from openai import OpenAI

from dotenv import load_dotenv

import os

load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Remove this duplicate line:

# client = OpenAI()

# The correct method is completions.create or chat.completions.create

# There is no "responses.create" method in the OpenAI API

response = client.chat.completions.create(

model="gpt-4o",

messages=[{"role": "user", "content": "Write a one-sentence bedtime story about a unicorn."}]

)

# The response structure is different

print(response.choices[0].message.content)

"""Correlation with the two books"""
def chunk_text(text, num_chunks=5):
words = text.split()
chunk_size = len(words) // num_chunks
return [' '.join(words[i * chunk_size:(i + 1) * chunk_size]) for i in range(num_chunks)]

chunks1 = chunk_text(text1)
chunks2 = chunk_text(text2)
chunks = chunks1 + chunks2  # 5 chunks per book, 10 total

# Pairwise fuzzy similarity between every chunk pair, scaled to [0, 1].
n = len(chunks)
S = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        # token_sort_ratio normalizes word order before comparing.
        S[i][j] = fuzz.token_sort_ratio(chunks[i], chunks[j]) / 100.0

dissimilarities = 1 - S

# MDS embedding (fixed random_state for a reproducible layout).
coord = MDS(dissimilarity='precomputed', random_state=42).fit_transform(dissimilarities)

# Plot: blue points are East of Eden chunks, green are Great Gatsby.
plt.figure(figsize=(10, 6))
plt.scatter(coord[:, 0], coord[:, 1], c=['blue'] * 5 + ['green'] * 5, s=100)

for i in range(coord.shape[0]):
    label = f"Eden {i+1}" if i < 5 else f"Gatsby {i-4}"
    plt.annotate(label, (coord[i, 0] + 0.01, coord[i, 1] + 0.01), fontsize=9)

plt.title("MDS Plot of East of Eden vs The Great Gatsby using Fuzzy Similarity")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.grid(True)
plt.show()

# Putting the histograms on a list to pickle.

def main():
    """Process both book files and report word-count statistics.

    NOTE(review): process_file, total_words, different_words and
    most_common are not defined in this file — presumably they come
    from the course's book-analysis module; confirm the import before
    running.
    """
    book_files = [
        "Parts/East of Eden.txt",
        "Parts/The Great Gatsby.txt",
    ]

    histograms = []
    for book in book_files:
        hist = process_file(book, skip_header=True)
        histograms.append(hist)
        print(f"Processed '{book}':")
        print(f" Total words: {total_words(hist)}")
        print(f" Unique words: {different_words(hist)}\n")

    # Example: most common words in each book.
    for i, hist in enumerate(histograms):
        print(f"Most common words in Book {i+1}:")
        # Local name 'common' avoids shadowing the module-level
        # top_words() function.
        common = most_common(hist, excluding_stopwords=True)
        for freq, word in common[:10]:
            print(f"{word}\t{freq}")
        print("\n")

# Pickle the raw book texts so later scripts can reload them quickly.

import pickle

# NOTE(review): these paths have no .txt extension, unlike the
# "Parts/....txt" paths used in main() above — confirm which matches
# the actual files on disk.
with open('Parts/East of Eden', 'r', encoding='utf-8') as f1:
    east_of_eden_text = f1.read()

with open('Parts/The Great Gatsby', 'r', encoding='utf-8') as f2:
    great_gatsby_text = f2.read()

# Combine both texts into a single dictionary keyed by title.
books = {
    "East of Eden": east_of_eden_text,
    "The Great Gatsby": great_gatsby_text,
}

# Save the dictionary to a pickle file.
with open('books_texts.pkl', 'wb') as f:
    pickle.dump(books, f)

# Load the data back from the pickle file later.
with open('books_texts.pkl', 'rb') as f:
    reloaded_books = pickle.load(f)

`

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant