Skip to content

Comments

Assignment 2#104

Open
avargasd154 wants to merge 3 commits into OIM3640:main from avargasd154:main
Open

Assignment 2#104
avargasd154 wants to merge 3 commits into OIM3640:main from avargasd154:main

Conversation

@avargasd154
Copy link

No description provided.

@avargasd154
Copy link
Author

`import random
import string
import sys
from unicodedata import category

import urllib.request

# Common English stop words to exclude from word-frequency counts
# (list AI generated).
stop_words = {
    'the', 'and', 'to', 'of', 'a', 'in', 'that', 'is', 'was', 'it', 'for', 'on', 'with',
    'as', 'he', 'be', 'at', 'by', 'i', 'this', 'had', 'not', 'are', 'but', 'from', 'or',
    'have', 'an', 'they', 'which', 'you', 'one', 'were', 'her', 'all', 'she', 'there',
    'would', 'their', 'we', 'him', 'been', 'has', 'when', 'who', 'will', 'more', 'no',
    'if', 'out', 'so', 'up', 'what', 'said', 'about', 'than', 'its', 'into', 'them',
    'then', 'some', 'could', 'my', 'may', 'these', 'can', 'his', 'me', 'do', 'such',
}

"""East of Eden"""
url1 = 'https://www.gutenberg.org/cache/epub/75781/pg75781.txt'
try:
with urllib.request.urlopen(url1) as f:
text1 = f.read().decode('utf-8')
except Exception as e:
print("An error occurred:", e)

"""The Great Gatsby"""
url2 = 'https://www.gutenberg.org/cache/epub/64317/pg64317.txt'
try:
with urllib.request.urlopen(url2) as f:
text2 = f.read().decode('utf-8')
except Exception as e:
print("An error occurred:", e)

# Summarize a text by counting how many times each particular word appears.

def count_word_frequencies(text, remove_stop):
    """Return a dict mapping each word in *text* to its frequency.

    text: the raw text to analyze (case-insensitive; punctuation ignored).
    remove_stop: when True, words in the module-level ``stop_words`` set
        are skipped and not counted.
    """
    text = text.lower()

    # Replace punctuation with spaces so "word," and "word" count together.
    for char in string.punctuation:
        text = text.replace(char, ' ')

    word_freq = {}
    for word in text.split():
        # Skip empty tokens and (optionally) common stop words.
        if not word or (remove_stop and word in stop_words):
            continue
        word_freq[word] = word_freq.get(word, 0) + 1

    return word_freq

# How many times do the most popular words show up in each book?
# remove_stop=True excludes the common stop words from the counts.
eden_freq = count_word_frequencies(text1, remove_stop=True)
gatsby_freq = count_word_frequencies(text2, remove_stop=True)

def frequency(item):
    """Sort key: return the count from a (word, count) pair."""
    _, count = item
    return count

def top_words(word_freq, title, n=20):
    """Print and return the *n* most frequent (word, count) pairs.

    word_freq: dict mapping word -> count.
    title: label used in the printed header.
    n: how many entries to show (default 20).

    Returns the ranked list so callers can reuse it; the original
    returned None, so ignoring the return value stays valid.
    """
    print(f"The top {n} words in {title} are:")
    # Sort by count descending; Python's sort is stable, so ties keep
    # their original relative order.
    ranked = sorted(word_freq.items(), key=lambda pair: pair[1], reverse=True)[:n]
    for word, freq in ranked:
        print(f"{word}: {freq}")
    return ranked

# Report the 20 most frequent non-stop words for each novel.
top_words(eden_freq, "East of Eden")
top_words(gatsby_freq, "The Great Gatsby")

"""Movie Reviews"""

from imdb import Cinemagoer

# create an instance of the Cinemagoer class

ia = Cinemagoer()

# search movie

movie1 = ia.search_movie("East of Eden")[0]

print(movie1.movieID)

# Get reviews

movie1 = ia.get_movie('0048028', info=['reviews']) # Make sure to add the second argument

reviews1 = movie1.get('reviews', [])

for review in reviews1:

print(review['content'])

print()

# Get actor

James_Dean = ia.get_person('0000015')

# Get actor's movies

filmography = James_Dean.get('filmography', {})

films_as_actor = filmography.get('actor', [])

print(films_as_actor)

"""The Great Gatsby"""

# search movie

movie2 = ia.search_movie("The Great Gatsby")[0]

print(movie2.movieID)

# Get reviews

movie2 = ia.get_movie('1343092', info=['reviews']) # Make sure to add the second argument

reviews2 = movie2.get('reviews', [])

for review in reviews2:

print(review['content'])

print()

# Get actor

Leo_DiCap = ia.get_person('0000138')

# Get actor's movies

filmography = Leo_DiCap.get('filmography', {})

films_as_actor = filmography.get('actor', [])

print(films_as_actor)

from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# The VADER lexicon must be downloaded once before scoring.
nltk.download('vader_lexicon')

sentence = 'Software Design is my favorite class because learning Python is so cool!'
score = SentimentIntensityAnalyzer().polarity_scores(sentence)
print(score)

# Example output:
# {'neg': 0.0, 'neu': 0.614, 'pos': 0.386, 'compound': 0.7417}

from thefuzz import fuzz

# Quick demonstration of thefuzz string-similarity scorers.
a, b = "this is a test", "this is a test!"
print(fuzz.ratio(a, b))             # 97
print(fuzz.partial_ratio(a, b))     # 100

c, d = "fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"
print(fuzz.ratio(c, d))             # 91
print(fuzz.token_sort_ratio(c, d))  # 100

import numpy as np
from sklearn.manifold import MDS
import matplotlib.pyplot as plt

# These are the similarities computed in the previous section.
S = np.asarray([[1., 0.90850572, 0.96451312, 0.97905034, 0.78340575],
                [0.90850572, 1., 0.95769915, 0.95030073, 0.87322494],
                [0.96451312, 0.95769915, 1., 0.98230284, 0.83381607],
                [0.97905034, 0.95030073, 0.98230284, 1., 0.82953109],
                [0.78340575, 0.87322494, 0.83381607, 0.82953109, 1.]])

# Dissimilarity is 1 minus similarity.
dissimilarities = 1 - S

# Compute the 2-D embedding from the precomputed dissimilarity matrix.
coord = MDS(dissimilarity='precomputed').fit_transform(dissimilarities)

plt.scatter(coord[:, 0], coord[:, 1])

# Label each point with its row index.
for i in range(coord.shape[0]):
    plt.annotate(str(i), (coord[i, 0], coord[i, 1]))

plt.show()

"""Request and Open API""" # Why is my API wrong?

"""Request and Open API"""

from openai import OpenAI

from dotenv import load_dotenv

import os

load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Remove this duplicate line:

# client = OpenAI()

# The correct method is completions.create or chat.completions.create

# There is no "responses.create" method in the OpenAI API

response = client.chat.completions.create(

model="gpt-4o",

messages=[{"role": "user", "content": "Write a one-sentence bedtime story about a unicorn."}]

)

# The response structure is different

print(response.choices[0].message.content)

"""Correlation with the two books"""
def chunk_text(text, num_chunks=5):
words = text.split()
chunk_size = len(words) // num_chunks
return [' '.join(words[i * chunk_size:(i + 1) * chunk_size]) for i in range(num_chunks)]

chunks1 = chunk_text(text1)
chunks2 = chunk_text(text2)
chunks = chunks1 + chunks2  # 5 chunks per book, 10 total

# Pairwise fuzzy similarity between every chunk pair, scaled to [0, 1].
n = len(chunks)
S = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        # token_sort_ratio normalizes word order before comparing.
        S[i][j] = fuzz.token_sort_ratio(chunks[i], chunks[j]) / 100.0

dissimilarities = 1 - S

# MDS embedding (fixed random_state for a reproducible layout).
coord = MDS(dissimilarity='precomputed', random_state=42).fit_transform(dissimilarities)

# Plot: blue points are East of Eden chunks, green are Great Gatsby.
plt.figure(figsize=(10, 6))
plt.scatter(coord[:, 0], coord[:, 1], c=['blue'] * 5 + ['green'] * 5, s=100)

for i in range(coord.shape[0]):
    label = f"Eden {i+1}" if i < 5 else f"Gatsby {i-4}"
    plt.annotate(label, (coord[i, 0] + 0.01, coord[i, 1] + 0.01), fontsize=9)

plt.title("MDS Plot of East of Eden vs The Great Gatsby using Fuzzy Similarity")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.grid(True)
plt.show()

# Putting the histograms on a list to pickle.

def main():
    """Process both book files and report word-count statistics.

    NOTE(review): process_file, total_words, different_words and
    most_common are not defined in this file — presumably they come
    from the course's book-analysis module; confirm the import before
    running.
    """
    book_files = [
        "Parts/East of Eden.txt",
        "Parts/The Great Gatsby.txt",
    ]

    histograms = []
    for book in book_files:
        hist = process_file(book, skip_header=True)
        histograms.append(hist)
        print(f"Processed '{book}':")
        print(f" Total words: {total_words(hist)}")
        print(f" Unique words: {different_words(hist)}\n")

    # Example: most common words in each book.
    for i, hist in enumerate(histograms):
        print(f"Most common words in Book {i+1}:")
        # Local name 'common' avoids shadowing the module-level
        # top_words() function.
        common = most_common(hist, excluding_stopwords=True)
        for freq, word in common[:10]:
            print(f"{word}\t{freq}")
        print("\n")

# Pickle the raw book texts so later scripts can reload them quickly.

import pickle

# NOTE(review): these paths have no .txt extension, unlike the
# "Parts/....txt" paths used in main() above — confirm which matches
# the actual files on disk.
with open('Parts/East of Eden', 'r', encoding='utf-8') as f1:
    east_of_eden_text = f1.read()

with open('Parts/The Great Gatsby', 'r', encoding='utf-8') as f2:
    great_gatsby_text = f2.read()

# Combine both texts into a single dictionary keyed by title.
books = {
    "East of Eden": east_of_eden_text,
    "The Great Gatsby": great_gatsby_text,
}

# Save the dictionary to a pickle file.
with open('books_texts.pkl', 'wb') as f:
    pickle.dump(books, f)

# Load the data back from the pickle file later.
with open('books_texts.pkl', 'rb') as f:
    reloaded_books = pickle.load(f)

`

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant