diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index dfe0770..0000000 --- a/.gitattributes +++ /dev/null @@ -1,2 +0,0 @@ -# Auto detect text files and perform LF normalization -* text=auto diff --git a/.gitignore b/.gitignore index d9005f2..61cc06c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,152 +1,36 @@ -# Byte-compiled / optimized / DLL files +# Python __pycache__/ *.py[cod] *$py.class - -# C extensions *.so - -# Distribution / packaging .Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints -# IPython -profile_default/ -ipython_config.py +# Cache files +reviews_cache.pkl +*.pkl -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version +# Generated output files +review_wordfreqs.tsv +example-output.txt -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
-# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock +# OS files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock +# IDE +.vscode/ +.idea/ +*.swp +*.swo -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ +# Environment venv/ +env/ ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ -# Cython debug symbols -cython_debug/ +# Kaggle dataset cache (if downloaded locally) +imdb_dataset.csv -# PyCharm -# JetBrains specific template is maintainted in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ diff --git a/README.md b/README.md index 05aa109..db59890 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,200 @@ -# Text-Analysis-Project - -Please read the [instructions](instructions.md). +# IMDb Reviews Analyzer + +## 1. 
Project Overview + +This project is an IMDb Reviews Analyzer built with Python that processes and analyzes movie review text from the internet. The project was initially designed to use the Cinemagoer library to fetch user reviews directly from IMDb, but encountered a limitation: Cinemagoer's review endpoint is currently nonfunctional (as of 2025), with the `reviews` info key returning empty results even when movie metadata is successfully retrieved. + +To work around this limitation while still meeting the assignment requirement of harvesting text from an internet source, the project switched to using a Kaggle dataset containing 50,000 IMDb movie reviews (`lakshmi25npathi/imdb-dataset-of-50k-movie-reviews`). This dataset provides real IMDb review text that can be programmatically accessed and analyzed. + +The project's primary goals are to: +- Clean and preprocess text data (removing HTML tags, normalizing punctuation, handling encoding) +- Perform word frequency analysis to identify the most common terms in reviews +- Compute summary statistics (token count, vocabulary size, average word length, type-token ratio) +- Visualize results using ASCII bar charts (avoiding matplotlib as recommended) +- Optionally perform sentiment analysis using VADER (NLTK) to gauge emotional tone of reviews + +The analyzer processes reviews for multiple movies (e.g., "The Dark Knight", "Barbie", "Oppenheimer") and generates comparative word frequency statistics and visualizations. + +## 2. Implementation + +The project follows a modular architecture with clear separation of concerns across four main Python files: + +### Architecture + +**`main.py`** - Entry point that orchestrates the analysis pipeline. Uses the `if __name__ == "__main__"` idiom to enable both script execution and module import. Coordinates data fetching and analysis for multiple movie titles. + +**`reviews_fetcher.py`** - Handles data harvesting from the Kaggle dataset. 
This module: +- Downloads the IMDb reviews dataset using `kagglehub` library +- Loads the CSV file using pandas (justification below) +- Caches results using pickle to avoid re-downloading on subsequent runs +- Distributes reviews across requested movie titles + +**`text_utils.py`** - Core text processing utilities implementing all required analysis steps: +- `clean_text()`: Removes HTML tags, decodes entities, lowercases, strips punctuation +- `remove_stopwords()`: Filters out common stop words using a custom list +- `word_frequencies()`: Uses Python's `Counter` to compute word frequencies +- `summary_stats()`: Calculates vocabulary metrics (token count, vocab size, avg word length, type-token ratio) +- `ascii_bar_chart()`: Generates ASCII-based visualizations without matplotlib + +**`analyze_reviews.py`** - Orchestrates the analysis pipeline for a single movie title: +- Applies cleaning, tokenization, and stopword removal +- Computes frequencies and statistics +- Generates ASCII visualizations +- Optionally applies VADER sentiment analysis (toggleable via `USE_SENTIMENT` flag) + +### Design Decisions + +**Pandas Usage Justification**: Pandas was used only for reading the Kaggle CSV dataset, which required DataFrame parsing for initial text access. The `kagglehub` library's `KaggleDatasetAdapter.PANDAS` interface requires pandas to load the dataset. All subsequent analysis steps use standard Python libraries (`collections.Counter`, built-in string methods, etc.) rather than pandas operations. This minimal use of pandas is justified because manually parsing a 25MB CSV file with 50,000 rows would be significantly more complex and error-prone than using pandas' robust CSV reader. + +**Caching Strategy**: The project uses pickle to cache fetched reviews, avoiding repeated API calls or dataset downloads. This improves performance and reduces load on external services. 
+ +**ASCII Visualization**: Instead of matplotlib (which the assignment recommends avoiding), the project uses ASCII bar charts printed to the console. This approach is lightweight, requires no external dependencies, and produces readable visualizations directly in terminal output. + +### AI Tool Assistance + +ChatGPT was used primarily for tedious tasks and documentation rather than core code generation: +- **Documentation**: Generated docstrings and comments explaining what each code section does (tedious but necessary for clarity) +- **Stopword list**: Compiled the comprehensive stopword list used in `text_utils.py` (time-consuming task of listing common English stopwords) +- **Troubleshooting**: When Cinemagoer's review endpoint failed, ChatGPT helped diagnose the problem and suggested alternative data sources (Kaggle dataset) +- **Reference lookups**: Looked up specific API usage (e.g., "How to use VADER sentiment analyzer from NLTK") and encoding solutions (e.g., "How to handle latin-1 encoding issues when reading CSV files with pandas") + +All code logic, algorithms, and architecture decisions were implemented manually. ChatGPT was used as a research and documentation tool to speed up tedious tasks like writing comments and compiling reference lists. All AI-assisted sections are marked with comments indicating the specific use of ChatGPT. + +## 3. Results + +The analyzer successfully processes movie reviews and generates meaningful insights. 
Here are sample outputs from analyzing three popular films: + +### Sample Output: The Dark Knight + +``` +=== The Dark Knight === +num_tokens: 6463 +vocab_size: 3005 +avg_word_len: 5.847 +type_token_ratio: 0.465 + +Top 20 words: + one | ################################################ 49 + all | ############################################ 45 + like | ######################################## 41 + him | ################################ 33 + no | ############################# 30 + time | ############################ 29 + only | ########################## 27 + well | ######################## 25 +``` + +### Sample Output: Barbie + +``` +=== Barbie === +num_tokens: 6288 +vocab_size: 2849 +avg_word_len: 5.788 +type_token_ratio: 0.4531 + +Top 20 words: + all | ################################################ 52 + good | ####################################### 43 + one | ###################################### 42 + like | #################################### 39 + very | ############################# 32 +``` + +### Findings + +**Word Frequency Patterns**: Across all analyzed movies, common words like "all", "one", "like", "good", "very", "time", and "story" appear frequently. These are content words (not stopwords) that reflect common review language patterns. The presence of "good" and "like" suggests positive sentiment vocabulary, while "story", "characters", and "time" indicate reviewers focus on narrative elements. + +**Vocabulary Richness**: The type-token ratio (TTR) ranges from 0.41 to 0.47, indicating moderate vocabulary diversity. Higher TTR suggests more varied word choice, while lower TTR suggests repetitive language. Movies show similar TTR values, suggesting consistent review writing styles across the dataset. + +**Average Word Length**: Words average 5.7-5.8 characters, which is typical for English text. This metric helps validate that the text cleaning process is working correctly (not removing too much content). 
+ +**Sentiment Analysis**: When enabled (by setting `USE_SENTIMENT = True` in `analyze_reviews.py`), VADER sentiment analysis provides compound sentiment scores ranging from -1 (most negative) to +1 (most positive). The optional sentiment feature allows for deeper emotional analysis of review text. + +The project also generates a `review_wordfreqs.tsv` file that can be opened in spreadsheet software for further analysis or visualization. + +### Sample Terminal Output + +The analyzer produces formatted ASCII bar charts directly in the terminal. A full sample output is saved in `images/output.txt` showing the complete analysis results for all three movies. + +``` +=== The Dark Knight === +num_tokens: 6463 +vocab_size: 3005 +avg_word_len: 5.847 +type_token_ratio: 0.465 + +Top 20 words: + one | ################################################ 49 + all | ############################################ 45 + like | ######################################## 41 + him | ################################ 33 + no | ############################# 30 + time | ############################ 29 +``` + +This visualization approach allows for immediate viewing of results without requiring external plotting libraries. + +## 4. Reflection + +### Process Reflection + +**What Went Well**: The modular architecture made development straightforward. Breaking the project into `reviews_fetcher.py`, `text_utils.py`, `analyze_reviews.py`, and `main.py` allowed for independent testing and iteration. The switch to Kaggle dataset was smooth once the encoding issues were resolved. The ASCII visualization approach proved effective for terminal-based output without requiring matplotlib. + +**Biggest Challenge**: The primary challenge was Cinemagoer's nonfunctional review endpoint. Initial attempts to fetch reviews directly from IMDb failed, requiring research into alternative data sources. 
The Kaggle dataset alternative introduced new challenges: +- Encoding issues: The CSV file used `latin-1` encoding, which required specific pandas configuration +- Column parsing: Initial attempts to load the dataset resulted in corrupted column names, requiring explicit column renaming +- File size: The 25MB dataset required careful handling of memory and download time + +**How It Was Solved**: ChatGPT was helpful for troubleshooting and research. When Cinemagoer failed, I asked ChatGPT for alternative IMDb data sources and received suggestions for Kaggle datasets. For encoding problems, ChatGPT provided guidance on pandas `read_csv` parameters, leading to the `encoding='latin-1'` and `engine='python'` solution. + +**Testing Plan**: Testing was done incrementally: +1. Tested data fetching with a single movie title +2. Verified text cleaning produced expected output (no HTML tags, proper lowercasing) +3. Validated word frequencies matched manual counts for small samples +4. Confirmed ASCII charts displayed correctly +5. Tested caching to ensure subsequent runs used cached data + +**Project Scope**: The project scope was appropriately sized. It met all required steps while including one optional technique (sentiment analysis). The modular design makes it easy to extend with additional features (e.g., text similarity, clustering) without major refactoring. + +### Learning Reflection + +**Biggest Takeaway**: The most valuable learning was understanding how to work with real-world data sources that have limitations. 
Encountering Cinemagoer's broken API taught me to: +- Research alternative data sources when primary ones fail +- Handle encoding issues that are common in web-scraped data +- Use caching effectively to avoid repeated downloads +- Adapt project scope when technical constraints emerge + +**AI Tools' Role**: ChatGPT was most useful for tedious tasks and quick reference lookups: +- **Stopword compilation**: Asked ChatGPT to generate a comprehensive list of common English stopwords rather than manually typing them all +- **Documentation**: Used ChatGPT to write clear docstrings and comments explaining code sections (tedious but necessary) +- **Quick reference**: When I needed to look up specific API usage (like VADER sentiment analyzer), ChatGPT provided quick answers instead of digging through documentation +- **Troubleshooting**: When encoding errors occurred, ChatGPT suggested solutions to try (like using `latin-1` encoding) + +The AI assistance was most effective for tasks that were time-consuming but straightforward (like compiling stopwords) or for quick reference lookups. All code logic and architecture decisions were implemented manually. + +**Future Applications**: The skills learned here apply to any text analysis project: +- Working with APIs and datasets +- Text preprocessing pipelines +- Statistical analysis of language patterns +- Building modular, maintainable code + +**What I Wish I Knew**: +- That Cinemagoer's review endpoint was broken—would have saved time by starting with Kaggle dataset +- More about encoding issues in CSV files—would have anticipated latin-1 encoding challenges +- The importance of caching early—would have implemented caching from the start to speed up iteration + +### Future Improvements + +Potential enhancements for this project: +1. **Text Similarity**: Add cosine similarity calculation to compare vocabulary across different movies +2. 
**Sentiment Distribution**: When sentiment is enabled, create visualizations showing sentiment distribution (negative/neutral/positive buckets) +3. **Comparative Analysis**: Generate side-by-side comparisons of multiple movies' word frequencies +4. **Interactive Dashboard**: Create a simple web interface using Flask to explore results interactively +5. **Movie-Specific Filtering**: Improve the Kaggle dataset usage to actually filter reviews by movie title (currently samples randomly) +6. **Time-Based Analysis**: If review dates become available, analyze sentiment trends over time + +The modular architecture makes these improvements straightforward to implement without major refactoring. + +# text-analysis + \ No newline at end of file diff --git a/analyze_reviews.py b/analyze_reviews.py new file mode 100644 index 0000000..5aa6bee --- /dev/null +++ b/analyze_reviews.py @@ -0,0 +1,66 @@ +# analyze_reviews.py +# Required steps: clean -> stopwords -> freqs -> stats -> ASCII chart. +# Optional: VADER sentiment (toggle with USE_SENTIMENT = True) +# +# AI Assistance (ChatGPT, November 2025): +# - Used for documentation: Writing docstrings and comments explaining what each code section does +# - Used for reference: "How to use VADER sentiment analyzer from NLTK in Python" (looked up API usage) + +from typing import List, Dict +from collections import Counter +from text_utils import clean_text, tokenize, remove_stopwords, word_frequencies, summary_stats, ascii_bar_chart + +# Optional sentiment block (safe if nltk isn't installed as long as USE_SENTIMENT=False) +USE_SENTIMENT = False +_sia = None + +def _init_sentiment(): + global _sia + if _sia is None: + import nltk + nltk.download('vader_lexicon', quiet=True) + from nltk.sentiment.vader import SentimentIntensityAnalyzer + _sia = SentimentIntensityAnalyzer() + +def analyze_title_reviews(title: str, reviews: List[Dict], top_n: int = 20): + """ + reviews: list of dicts with 'content' + Prints stats & top words, returns a 
dict of results you can save if needed. + """ + tokens = [] + sentiments = [] + + for r in reviews: + text = clean_text(r.get("content", "")) + toks = tokenize(text) + toks = remove_stopwords(toks) + tokens.extend(toks) + + if USE_SENTIMENT: + if _sia is None: + _init_sentiment() + score = _sia.polarity_scores(text)["compound"] + sentiments.append(score) + + freqs: Counter = word_frequencies(tokens) + stats = summary_stats(tokens) + top = freqs.most_common(top_n) + + print(f"\n=== {title} ===") + for k, v in stats.items(): + print(f"{k}: {v}") + print(f"\nTop {top_n} words:") + print(ascii_bar_chart(top)) + + if USE_SENTIMENT and sentiments: + avg = sum(sentiments)/len(sentiments) + print(f"\nApprox. average VADER compound sentiment: {avg:.3f}") + + return { + "title": title, + "stats": stats, + "top_words": top, + "num_reviews": len(reviews), + "avg_sentiment": (sum(sentiments)/len(sentiments)) if (USE_SENTIMENT and sentiments) else None + } + diff --git a/images/text_clustering.png b/images/text_clustering.png deleted file mode 100644 index 49fb044..0000000 Binary files a/images/text_clustering.png and /dev/null differ diff --git a/instructions.md b/instructions.md deleted file mode 100644 index 1356910..0000000 --- a/instructions.md +++ /dev/null @@ -1,597 +0,0 @@ -# Text Analysis Project - -## Introduction - -In this project, you will learn how to use computational techniques to analyze text. You will access text from a variety of sources, including websites and APIs, and run computational analyses to create some sort of deliverable, such as interesting results from a text analysis, a visualization, or even a Python program that manipulates language in some interesting way. As part of the project, you are encouraged to use AI tools to explore how to talk to APIs and how to use Python libraries that have not been covered in class yet. This assignment is an **individual project**. 
- -**Skills Emphasized**: - -- Accessing data programmatically from various sources on the Internet -- Parsing text and storing it in appropriate data structures -- Selecting the most suitable data structures for a specific task (e.g. dictionaries versus lists) -- Applying computational methods to analyze, characterize, and compare text -- Experimenting with AI tools to enhance the learning process and explore new tools and techniques. - ---- - -## How to Proceed - -To get started on the assignment, you should first **fork** this base repository. Once you've forked the repository, **clone** the **forked** repository (the one under your GitHub profile) to your computer. You need to create one or multiple `.py` files in the **forked** repository. - -You should read this document in a somewhat non-linear/spiral fashion: - -1. Scan through **Part 1** to get a sense of what data sources are available. You can select one or two sources that interests you and try to retrieve text from them. Note that you do not need to try all the data sources. -2. Scan through **Part 2** to see a bunch of cool examples for what you can do with your text. You can also ask AI tools what else you can do with Python to process, analyze or visulize the text. -3. Choose (at least) one data source from **Part 1** and apply required techniques from **Part 2**, plus any additional techniques that interest you, to analyze, manipulate, transform or visualize the text. -4. Make sure there is one clear entry `.py` file for the entire project. Multiple `.py` files are encouraged to break the project into smaller, modular components. -5. Use the `if __name__ == "__main__"` idiom in the `.py` files. Your code should be executed when the entry Python file is run. - ```python - if __name__ == "__main__": - main() - ``` -6. You are required to experiment with learning from AI tools (see more in **Part 3**). -7. Write a brief document (**Part 4**) describing your process and your reflection. -8. 
If you use any code or solutions that is not written by you (or that you learned from other places such as StackOverFlow/GitHub), please add Python comments (before the block of code) describing where you got/learned it from. -9. Generally I **DO NOT** recommend using `numpy`, `pandas`, `sklearn` or `matplotlib` in this project, unless there is no other alternative way of processing and analyazing your data. For instance, if you need to perform complex matrix computations, text clustering (like MDS), or advanced visualizations, it is acceptable to use these libraries. Please justify your choice in your project documentation. - -### Jupyter Notebook vs .py Files: Which to Use? - -**Required for Submission**: Your final project **must** include `.py` files as described above. This is a mandatory requirement. - -**Optional for Development**: You are encouraged to use Jupyter Notebooks (`.ipynb` files) during the development and exploration phase. Here's a recommended workflow: - -**Use Jupyter Notebooks for:** - -- **Exploratory Data Analysis**: Quickly test API connections, explore text data characteristics, and experiment with different processing methods -- **Interactive Visualization**: Adjust chart parameters in real-time and immediately see the results -- **Learning and Experimentation**: Test new libraries (NLTK, TextBlob, etc.) and debug code step-by-step -- **Documentation**: Keep notes and screenshots showing how you used AI tools to learn and solve problems - -**Use .py Files for:** - -- **Final Submission**: This is required by the project specifications -- **Production Code**: Well-organized, modular functions with proper error handling -- **Code Reusability**: Functions and classes that can be imported and reused -- **Version Control**: Git-friendly format for tracking changes - -**Workflow Recommendation:** - -1. **Explore in Jupyter**: Test APIs, experiment with text processing techniques, create visualizations interactively -2. 
**Refactor to .py**: Move successful code into well-organized Python modules with proper functions and documentation -3. **Submit Both** (optional): Include both `.ipynb` files (to show your learning process) and `.py` files (required for grading). You can reference your notebooks in the README to demonstrate how you used AI tools for learning. - -This approach allows you to leverage the interactive benefits of Jupyter Notebooks while meeting the project's requirements for well-structured Python code. - ---- - -## Part 1: Harvesting text from the Internet - -The goal for Part 1 is to collect some text from the Internet that you can later use for text analysis. Before diving deep into any particular method of text acquisition, it is recommended that you explore the different APIs and Python libraries available to extract text from the web. However, before spending too much time going down a particular path on the text acquisition component, you should look ahead to Part 2 to understand some of the things you can do with text you are harvesting. The key to a successful project is combining a relevant source of text with an appropriate technique for analysis (see Part 2). - -**Note**: Some APIs (such as Twitter and Reddit) may require a paid subscription or a lengthy application process. It is recommended to apply for API credentials in advance or choose alternative free data sources to avoid delays later in the project. - -### Installing Python Packages - -Throughout this project, you will need to install various Python libraries. 
Here are the recommended methods: - -**If you are using Anaconda** (recommended for this course): - -```shell -# Use conda to install packages (preferred method for Anaconda users) -conda install -c conda-forge package_name - -# If the package is not available in conda, use pip with Anaconda's Python -python -m pip install package_name -``` - -**If you are using standard Python installation** (not Anaconda, not for this course): - -```shell -# For Windows users -python -m pip install package_name - -# For macOS/Linux users -python3 -m pip install package_name -``` - -**Important Notes**: - -- Always use `python -m pip install` instead of just `pip install` to ensure you're installing to the correct Python environment -- If you're using Anaconda, try `conda install` first, as it handles dependencies better -- You can check which Python you're using by running `python --version` or `which python` (macOS/Linux) or `where python` (Windows) - -### Data Source: Project Gutenberg - -Project Gutenberg () is a website that provides over 55,000 e-books that are freely available to the public. Unlike some sites, all of the texts on Project Gutenberg are in the public domain, which means they are no longer protected by copyright. For example, the site offers 171 works by Charles Dickens. The best thing about these texts is that they are available in plain text format, which makes them easy to analyze using Python. - -To download a book from Project Gutenberg, first use the search engine on the Project Gutenberg website to find a book you are interested in downloading. For example, if you want to download *Oliver Twist* by Charles Dickens, search for it on the website. Once you have found the book you want to download, go to its page on the Project Gutenberg website. Find the "Plain Text UTF-8" link on the book's page. Copy the link to the plain text version of the book. 
In the case of *Oliver Twist*, the link to the plain text version is `"https://www.gutenberg.org/cache/epub/730/pg730.txt"`. - -To download the text inside Python, you can use the following code: - -```python -import urllib.request - -url = 'https://www.gutenberg.org/cache/epub/730/pg730.txt' -try: - with urllib.request.urlopen(url) as f: - text = f.read().decode('utf-8') - print(text) # for testing -except Exception as e: - print("An error occurred:", e) -``` - -**Security Note**: When working with APIs that require credentials (Twitter, Reddit, News API, etc.), never commit your API keys to version control. Use environment variables or separate configuration files (e.g., `config.py`) and add them to `.gitignore`. - -Note that there is a preamble (boilerplate on Project Gutenberg, table of contents, etc.) that has been added to the text that you might want to strip out using Python code when you do your analysis. There is similar material at the end of the file. - -One limitation of using Project Gutenberg is that they impose a limit on how many texts you can download in a 24-hour period. If you are analyzing many texts, it may be more efficient to download them once and load them off disk, rather than fetching them from Project Gutenberg's servers every time you run your program. See the **Pickling Data** section below on how to save data to files and load it back into your program. Additionally, there are many mirrors of the Project Gutenberg site available if you want to get around the download restriction. - -### Data Source: Wikipedia - -Wikipedia is another valuable source of data that can be easily accessed and parsed using the [mediawiki library](https://github.com/barrust/mediawiki) which is a python wrapper and parser for the **MediaWiki API**. 
To install the library: - -```shell -# If using Anaconda (recommended) -conda install -c conda-forge pymediawiki - -# Or using pip -python -m pip install pymediawiki -``` - -Once you have installed the library, you can use it to search Wikipedia, get article summaries, and extract data like links and images from a page. To fetch a particular article and print out its sections, you can use the following Python code: - -```python -from mediawiki import MediaWiki - -wikipedia = MediaWiki() -babson = wikipedia.page("Babson College") -print(babson.title) -print(babson.content) -``` - -This code will fetch the article with the given title and print its title and content. The output will look like this: - -```txt -Babson College (Babson) is a private business school in Wellesley, Massachusetts. Established in 1919, Babson's central focus is on entrepreneurship education and its use in creating economic and social value. The college was founded by Roger W. Babson as an all-male business institute and became coeducational in 1970. -... -``` - -You can also access other properties of a page, such as its categories, sections, and links. See the mediawiki package [documentation](https://pymediawiki.readthedocs.io/en/latest/quickstart.html#other-properties) for more information on available properties and methods. - -### Data Source: Twitter - -(**Note**: I have not tested this API since the announcment of shutting down free Twitter API. The free version of Twitter API has been deprecated and replaced with a new version that requires application approval and authentication with a paid subscription. To use the Twitter API, you need to apply to Twitter for a developer account and explain the purpose of what you are doing with the data, which Twitter will manually review.) - -(**Update**: You'll need at least the Basic access tier to search recent tweets, which isn't free. 
You can subscribe to it in your [Dashboard](https://developer.twitter.com/en/portal/dashboard) in the Developer Portal.) - -If you have access to a Twitter developer account and the necessary API keys and tokens, you can use the `tweepy` library to search for tweets. - -To install tweepy: - -```shell -# If using Anaconda (recommended) -conda install -c conda-forge tweepy - -# Or using pip -python -m pip install tweepy -``` - -Here is a simple example for searching tweets containing `Babson College`: - -```python -import tweepy - -# Replace the following strings with your own keys and secrets -TOKEN = 'Your TOKEN' -TOKEN_SECRET = 'Your TOKEN_SECRET' -CONSUMER_KEY = 'Your CONSUMER_KEY' -CONSUMER_SECRET = 'Your CONSUMER_SECRET' - -# Authenticate to Twitter -auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET) -auth.set_access_token(TOKEN,TOKEN_SECRET) - -api = tweepy.API(auth) - -for tweet in api.search_tweets(q="babson college", lang="en", count=10): - print(f"{tweet.user.name}: {tweet.text}") -``` - -### Data Source: Reddit - -Note: Reddit also requires users to register and create an application in order to obtain API credentials. After creating an application, you can obtain the necessary credentials such as `client_id`, `client_secret`, `username`, `password`, and `user_agent`. You can learn more about this process on the [Reddit API documentation](https://www.reddit.com/dev/api/). 
- -To get reddit data, you need to install the [PRAW library](https://github.com/praw-dev/praw): - -```shell -# If using Anaconda (recommended) -conda install -c conda-forge praw - -# Or using pip -python -m pip install praw -``` - -Here's an example from the [PRAW docs page](https://praw.readthedocs.io/en/stable/getting_started/quick_start.html): - -```python -import praw -import config - - -reddit = praw.Reddit(client_id=config.client_id, - client_secret=config.client_secret, - username=config.username, - password=config.password, - user_agent=config.user_agent) - -sub = 'learnpython' -submissions = reddit.subreddit(sub).top('day', limit=5) -for submission in submissions: - print(submission.title) - print(submission.selftext) -``` - -### Data Source: News API - -You can use `newsapi-python` library to fetch news articles from [News API](https://newsapi.org/docs/). You need to install the [newsapi-python library](https://github.com/mattlisiv/newsapi-python): - -```shell -# If using Anaconda (recommended) -conda install -c conda-forge newsapi-python - -# Or using pip -python -m pip install newsapi-python -``` - -Here's an example from the [Python client library page](https://newsapi.org/docs/client-libraries/python) in News API Documentation: - -```python -from newsapi import NewsApiClient - -# Init -newsapi = NewsApiClient(api_key='API_KEY') - -# /v2/top-headlines -top_headlines = newsapi.get_top_headlines(q='bitcoin', - sources='bbc-news,the-verge', - category='business', - language='en', - country='us') - -# /v2/everything -all_articles = newsapi.get_everything(q='bitcoin', - sources='bbc-news,the-verge', - domains='bbc.co.uk,techcrunch.com', - from_param='2017-12-01', - to='2017-12-12', - language='en', - sort_by='relevancy', - page=2) - -# /v2/top-headlines/sources -sources = newsapi.get_sources() -``` - -### Data Source: Newspaper Articles - -You can also use `Newspaper4k` package to scrape and curate news articles. 
You need to install the [Newspaper4k library](https://github.com/AndyTheFactory/newspaper4k): - -```shell -# If using Anaconda (recommended) -conda install -c conda-forge newspaper4k - -# Or using pip -python -m pip install newspaper4k -``` - -Here's an example from the [Newspaper4k Docs page](https://newspaper4k.readthedocs.io/en/latest/): - -```python -import newspaper - -article = newspaper.article('https://edition.cnn.com/2023/10/29/sport/nfl-week-8-how-to-watch-spt-intl/index.html') - -print(article.authors) -# ['Hannah Brewitt', 'Minute Read', 'Published', 'Am Edt', 'Sun October'] - -print(article.publish_date) -# 2023-10-29 09:00:15.717000+00:00 - -print(article.text) -# New England Patriots head coach Bill Belichick, right, embraces Buffalo Bills head coach Sean McDermott ... -``` - -### Data Source: IMDB Movie Reviews - -To get the IMDB data, you need to install [`cinemagoer` library](https://github.com/cinemagoer/cinemagoer): - -```shell -# If using Anaconda (recommended) -conda install -c conda-forge cinemagoer - -# Or using pip -python -m pip install cinemagoer -``` - -Here's an example to print the first review of the movie "The Dark Knight": -```python -from imdb import Cinemagoer - -# create an instance of the Cinemagoer class -ia = Cinemagoer() - -# search movie -movie = ia.search_movie("The Dark Knight")[0] -print(movie.movieID) -# '0468569' - -# Get reviews -movie = ia.get_movie('0468569', info=['reviews']) # Make sure to add the second argument -reviews = movie.get('reviews', []) - -for review in reviews: - print(review['content']) - print() - - -# Get actor -matt_damon = ia.get_person_filmography('0000354') - -# Get Matt Damon's movies -data = matt_damon['data'] -filmography = data['filmography'] -films_as_actor = filmography['actor'] -print(films_as_actor) -``` - -### Data Source: More Data Sources - -There are many other data sources that you can utilize in your project: - -- [Hugging Face Hub](https://huggingface.co/datasets) - - 
[Tutorial](https://huggingface.co/docs/datasets/en/load_hub) on How to Load a dataset from the Hub -- [Kaggle datasets](https://www.kaggle.com/datasets), which includes a variety of text datasets, such as news articles and movie reviews. -- [Yelp dataset](https://www.yelp.com/dataset) -- [SMS Spam Collection](https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset) is composed by 5,574 English, real and non-enconded messages, tagged according being legitimate (ham) or spam. -- [Enron email dataset](https://www.cs.cmu.edu/~./enron/) -- [News articles](https://archive.ics.uci.edu/dataset/137/reuters+21578+text+categorization+collection) in UCI Machine Learning Repository -- [Awesome Public Datasets](https://github.com/awesomedata/awesome-public-datasets) -- [Amazon AWS Registry of Open Data](https://registry.opendata.aws/), which includes several text datasets, such as Wikipedia and Common Crawl. -- ... - -Feel free to explore and choose the data source that fits your project's needs. - -### Pickling Data - -When you download text data from the Internet, it is often useful to save it to disk so that you do not have to re-download it every time you run your program. One way to do this in Python is to use the built-in `pickle` library, which allows you to serialize Python objects and save them to a file. - -In addition to pickling, you can also save files using JSON format. To explore more about the built-in `json` library, feel free to ask AI tools or visit the official Python documentation website. - ---- - -## Part 2: Analyzing Your Text - -This part consists of **required steps** that all students must complete, and **optional techniques** that you can choose from to extend your analysis. - -### Required Steps - -All students must complete the following steps: - -#### 1. Text Cleaning and Preprocessing - -(**Note**: This step is required.) - -Before analyzing your text, you need to clean and preprocess it. 
This includes: - -- Removing unwanted content (e.g., Project Gutenberg preambles, HTML tags, special characters) -- Converting text to lowercase for consistency -- Handling punctuation appropriately -- Dealing with encoding issues if any - -Real-world text data is often messy, and proper cleaning is essential for accurate analysis. - -#### 2. Removing Stop Words - -(**Note**: This step is required.) - -Stop words are words that occur frequently in text but do not provide useful information for analysis. Examples of stop words include "the", "and", "a", "is", etc. Removing stop words helps to: - -- Reduce the size of the text data -- Improve the accuracy of analysis -- Focus on meaningful words that carry semantic value - -You can use built-in stop word lists from libraries like NLTK, or create [your own custom list](https://github.com/OIM3640/resources/blob/main/code/data/stopwords.txt). - -#### 3. Word Frequency Analysis - -(**Note**: This step is required.) - -One way to begin to process your text is to take each unit of text (for instance, books from Project Gutenberg, or perhaps a collection of movie reviews) and summarize it by counting the number of times a particular word appears in the text. A natural way to approach this in Python would be to use a **dictionary** where the keys are words that appear and the values are frequencies of words in the text. If you want to do something fancier, you can use [TF-IDF features](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). - -#### 4. Computing Summary Statistics - -(**Note**: This step is required.) - -Beyond calculating word frequencies, there are other methods to summarize text. 
For instance, you may want to: - -- Identify the top 10 or top 20 most frequent words in each text -- Determine words that appear frequently in one text but not in others -- Calculate average word length, sentence length, or document length -- Compare vocabulary richness across different texts - -We practiced this using Jane Austen's novel in class, so I recommend starting with that example. - -#### 5. Data Visualization - -(**Note**: This step is required.) - -You must create at least one visualization to present your analysis results. This could include: - -- Bar charts showing top N most frequent words -- Word clouds (can use the `wordcloud` library) -- Line graphs comparing statistics across different texts -- Simple ASCII-based visualizations if you want to avoid matplotlib - -Visualizations help communicate your findings effectively and make your analysis more engaging. - -### Optional Techniques - -Choose **at least one** of the following advanced techniques to extend your analysis: - -### Optional Technique 1: Natural Language Processing - -(**Note**: Choose at least one optional technique.) - -[NLTK](https://www.nltk.org/) - the Natural Language Toolkit - is a powerful tool for processing human language data. It provides a wide range of capabilities, such as part-of-speech tagging, sentiment analysis, and full sentence parsing. - -To use NLTK, you need to install `nltk`: - -```shell -# If using Anaconda (recommended) -conda install -c conda-forge nltk - -# Or using pip -python -m pip install nltk -``` - -Here is an example of doing [sentiment analysis](https://en.wikipedia.org/wiki/Sentiment_analysis) using the `VADER` library in NLTK: - -```python -import nltk -nltk.download('vader_lexicon') # Download required data -from nltk.sentiment.vader import SentimentIntensityAnalyzer - -sentence = 'Software Design is my favorite class because learning Python is so cool!' 
-score = SentimentIntensityAnalyzer().polarity_scores(sentence) -print(score) -# Output -# {'neg': 0.0, 'neu': 0.614, 'pos': 0.386, 'compound': 0.7417} -``` - -Notice: If you receive `Resource vader_lexicon not found` error when using `nltk`, you need to enter `python` in **Command Prompt** (or `python3` in **Terminal** on macOS), then enter `import nltk` and `nltk.download('vader_lexicon')` in Python interactive shell. - -You can also use [TextBlob](https://github.com/sloria/TextBlob) library, which is built on top of NLTK, for almost everything that NLTK does. Below is the brief introduction of TextBlob from its GitHub page: - -> TextBlob is a Python library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, and more. - -If you perform natural language processing, you can draw interesting insights from text data collected from the web. For instance, if you monitor a specific subreddit related to a political topic, you can gauge the sentiment of the community by analyzing the text of each post and comment. Similarly, you can analyze discussions on subreddits dedicated to movies to identify which recent movies have received the most negative reviews. There are tons of cool options here! - -### Optional Technique 2: Text Similarity - -(**Note**: Choose at least one optional technique.) - -It is potentially quite useful to be able to compute the similarity of two texts. Suppose that we have characterized some texts from Project Gutenberg using word frequency analysis. One way to compute the similarity of two texts is to test to what extent when one text has a high count for a particular word the other text also a high count for a particular word. Specifically, we can compute the cosine similarity between the two texts. 
This strategy involves thinking of the word counts for each text as being high-dimensional vectors where the number of dimensions is equal to the total number of unique words in your text dataset and the entry in a particular element of the vector is the count of how frequently the corresponding word appears in a specific document. If you find this approach unclear and wish to try it, you can either reach out to the professor, or ask AI tools for assistance. - -For a simple text similarity task, you can use external libraries, like [`TheFuzz` library](https://github.com/seatgeek/thefuzz), which uses [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) to calculate the differences between sequences. - -```python -from thefuzz import fuzz - -print(fuzz.ratio("this is a test", "this is a test!")) # 97 -print(fuzz.partial_ratio("this is a test", "this is a test!")) # 100 -print(fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")) # 91 -print(fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")) # 100 -``` - -### Optional Technique 3: Text Clustering - -(**Note**: Choose at least one optional technique.) - -If you can generate pairwise similarities (say using the technique above), you can use Metric Multi-dimensional Scaling (MDS) to visualize the texts in a 2-dimensional space. This can help identify clusters of similar texts. - -In order to apply MDS to your data, you can use the machine learning toolkit `scikit-learn`. Here is some code that uses the similarity matrix defined in the previous section to create a 2-dimensional embedding of the four *Charles Dickens* and 1 *Charles Darwin* texts. 
- -```python -import numpy as np -from sklearn.manifold import MDS -import matplotlib.pyplot as plt - -# these are the similarities computed from the previous section -S = np.asarray([[1., 0.90850572, 0.96451312, 0.97905034, 0.78340575], - [0.90850572, 1., 0.95769915, 0.95030073, 0.87322494], - [0.96451312, 0.95769915, 1., 0.98230284, 0.83381607], - [0.97905034, 0.95030073, 0.98230284, 1., 0.82953109], - [0.78340575, 0.87322494, 0.83381607, 0.82953109, 1.]]) - -# dissimilarity is 1 minus similarity -dissimilarities = 1 - S - -# compute the embedding -coord = MDS(dissimilarity='precomputed').fit_transform(dissimilarities) - -plt.scatter(coord[:, 0], coord[:, 1]) - -# Label the points -for i in range(coord.shape[0]): - plt.annotate(str(i), (coord[i, :])) - -plt.show() -``` - -This will generate the following plot. The coordinates don't have any special meaning, but the embedding tries to maintain the similarity relationships that we computed via comparing word frequencies. Keep in mind that the point labeled 4 is the work by *Charles Darwin* and the other are by *Charles Dickens*. -text clustering - -### Optional Technique 4: Markov Text Synthesis - -(**Note**: Choose at least one optional technique.) - -You can use Markov analysis to learn a generative model of the text that you collect from the web and use it to generate new texts. You can even use it to create mashups of multiple texts. One of possibilities in this space would be to create literary mashups automatically. Again, let professor know if you go this route and we can provide more guidance. - -### Optional Technique 5: LLM (Large Language Model) Text Generation - -(**Note**: Choose at least one optional technique.) - -You can explore further possibilities by using the [OpenAI API](https://platform.openai.com/docs/overview). Feel free to ask for an API token if you're interested, and I'd be happy to provide it. I highly encourage you to give this a try! 
- ---- - -## Part 3: Learning with AI - -As you work through this project and experiment with different libraries in Python, you may encounter roadblocks or have questions about your code. That's when you can use AI tools, like ChatGPT to clear out any issues. You are also encouraged to learn other approaches, besides the techniques mentioned above, to process, analyze and visualize your own text dataset in Python from ChatGPT or other AI tools, who will serve as your assistant, providing helpful suggestions, aiding your learning process. - -**Reminder**: While AI tools can be incredibly helpful in resolving issues or suggesting new approaches, it’s important not to rely too heavily on them. Always test and validate the generated code, making sure it meets the project requirements and that you fully understand how the code works. Include comments in your code that indicate which parts were generated with AI assistance, and provide links or references to the sources if applicable. This practice not only helps maintain academic integrity but also demonstrates your learning process. - -Here's how to make the most out of AI tools (using ChatGPT as an example): - -- **Clearly Define Your Problem**: Take detailed notes on where you're stuck or what you're trying to achieve before asking ChatGPT for assistance. -- **Craft Detailed Prompts**: When asking ChatGPT for help, provide a clear and thorough description of the issue. The better you frame your question, the more helpful the response will be. -- **Review and Verify**: After receiving a response, carefully read the suggestions. Remember, AI-generated solutions may not always be accurate, so it's important to test the code and consult additional official documentation if needed. -- **Document Your Learning Process**: To track your progress, include ChatGPT Shared Links in your code comments or maintain a separate document. 
You may also take screenshots during your ChatGPT session and include them in your project write-up. - ---- - -## Part 4: Project Writeup and Reflection - -Write a summary of your project and your reflections on it in [`README.md`](README.md), using [Markdown format](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax). There is no need to use fancy words or ChatGPT. The [`README.md`](README.md) file should consist of the following sections: - -**1. Project Overview** (~1 paragraph) - -What data source(s) did you use? What technique(s) did you use to process or analyze them? What did you hope to create or learn through this project? - -**2. Implementation** (~1-2 paragraphs) - -Describe your implementation at a system architecture level. You should NOT walk through your code line by line, or explain every function (we can get that from your docstrings). Instead, talk about the major components, algorithms, data structures and how they fit together. You should also discuss at least one design decision where you had to choose between multiple alternatives, and explain why you made the choice. Use shared links and/or screenshots to describe how you used AI tools to help you or learn new things. - -**3. Results** (~1-3 paragraphs + figures/examples) - -Present what you accomplished in your project: - -- If you did some text analysis, what interesting things did you find? Graphs or other visualizations may be very useful here for showing your results. -- If you created a program that does something interesting (e.g. a Markov text synthesizer), be sure to provide a few interesting examples of the program's output. - -**4. Reflection** (~1-2 paragraphs) - -From a process point of view, what went well? What was the biggest challenge? How did you solve it? What could you improve? Was your project appropriately scoped? Did you have a good testing plan? 
-
-From a learning perspective, what was your biggest takeaway from this project? How did AI tools help you? How will you use what you learned going forward? What do you wish you knew beforehand that would have helped you succeed?
-
----
-
-## Submitting your Project
-
-1. Push all the code and updated `README.md` to the GitGub repository.
-2. Create a pull request to the upstream repository. Please learn how to create a pull request by following [this instruction](https://docs.github.com/en/desktop/working-with-your-remote-repository-on-github-or-github-enterprise/creating-an-issue-or-pull-request-from-github-desktop#creating-a-pull-request).
-3. Submit your project's GitHub repository URL to Canvas.
-
----
-*Updated*: *2025/10/26*
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..11641b5
--- /dev/null
+++ b/main.py
@@ -0,0 +1,31 @@
+# main.py
+# Entry file for the project: runs fetch -> analysis for chosen titles,
+# then dumps the per-title word frequencies to a TSV for inspection.
+#
+# AI Assistance (ChatGPT, November 2025):
+# - Used for documentation: Writing comments explaining the main entry point and code organization
+
+from reviews_fetcher import get_or_build_cache
+from analyze_reviews import analyze_title_reviews
+
+def main():
+    # Pick any films you like—franchise comparisons are fun!
+    # Note: Using Kaggle dataset due to Cinemagoer review limitations
+    titles = ["The Dark Knight", "Barbie", "Oppenheimer"]
+
+    data = get_or_build_cache(titles)
+
+    results = []
+    for t in titles:
+        results.append(analyze_title_reviews(t, data.get(t, []), top_n=20))
+
+    # Optionally write TSV to inspect in Sheets without pandas
+    with open("review_wordfreqs.tsv", "w", encoding="utf-8") as f:
+        f.write("title\tword\tcount\n")
+        for r in results:
+            for w, c in r["top_words"]:
+                f.write(f"{r['title']}\t{w}\t{c}\n")
+    print("\n[✓] Wrote review_wordfreqs.tsv")
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c822f06
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+nltk
+kagglehub[pandas-datasets]
+
diff --git a/reviews_fetcher.py b/reviews_fetcher.py
new file mode 100644
index 0000000..b5bf3be
--- /dev/null
+++ b/reviews_fetcher.py
@@ -0,0 +1,147 @@
+# reviews_fetcher.py
+# Updated to use Kaggle dataset due to Cinemagoer review limitations.
+#
+# AI Assistance (ChatGPT, November 2025):
+# - Used for troubleshooting: "How to handle latin-1 encoding issues when reading CSV files with pandas"
+# - Used for research: "Alternative data sources for IMDb movie reviews when Cinemagoer API is broken"
+# - Used for documentation: Generating docstrings and comments explaining code sections
+
+import kagglehub
+from kagglehub import KaggleDatasetAdapter
+from typing import List, Dict, Any
+import pickle, os
+import pandas as pd
+import re
+
+CACHE_FILE = "reviews_cache.pkl"
+KAGGLE_DATASET = "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
+
+def _normalize_title(title: str) -> str:
+    """Normalize movie title for matching (lowercase, remove special chars)."""
+    return re.sub(r'[^a-z0-9\s]', '', title.lower()).strip()
+
+def _load_kaggle_dataset():
+    """Load the Kaggle IMDb dataset as a DataFrame with 'review'/'sentiment' columns.
+
+    Raises on download/auth failure; raises FileNotFoundError if the adapter
+    returns something that is not a DataFrame.
+    """
+    # NOTE(review): local CSV caching was deliberately disabled (corruption
+    # issues); the dataset is re-downloaded via kagglehub each run.
+    print("Downloading Kaggle dataset (this may take a moment)...")
+    try:
+        # The dataset file is "IMDB Dataset.csv".
+        # latin-1 can decode any byte sequence; the python engine plus
+        # on_bad_lines='skip' tolerates malformed rows in the CSV.
+        df = kagglehub.load_dataset(
+            KaggleDatasetAdapter.PANDAS,
+            KAGGLE_DATASET,
+            "IMDB Dataset.csv",
+            pandas_kwargs={
+                'encoding': 'latin-1',
+                'engine': 'python',
+                'on_bad_lines': 'skip',
+                'quotechar': '"',
+                'skipinitialspace': True,
+                'header': 0
+            }
+        )
+        if isinstance(df, pd.DataFrame):
+            # The dataset's first two columns are review text and sentiment,
+            # but header bytes may arrive corrupted — relabel by position.
+            if len(df.columns) >= 2:
+                # BUGFIX: assigning ['review','sentiment'][:len(df.columns)]
+                # raised ValueError whenever the frame had MORE than two
+                # columns (length-mismatch assignment). Relabel the first
+                # two and keep any extras as-is.
+                df.columns = ['review', 'sentiment'] + list(df.columns[2:])
+            elif len(df.columns) == 1:
+                # Possibly malformed single-column parse: treat it as reviews.
+                df.columns = ['review']
+
+            # Work with the frame in memory; do not persist a possibly
+            # corrupted copy to disk.
+            return df
+    except Exception as e:
+        print(f"Error loading Kaggle dataset: {e}")
+        print("Note: You may need to authenticate with Kaggle first:")
+        print("  Run: kagglehub login")
+        print("  Or set KAGGLE_USERNAME and KAGGLE_KEY environment variables")
+        raise
+
+    # Reached only when load_dataset returned a non-DataFrame object.
+    raise FileNotFoundError("Could not find dataset file. Please check the dataset structure.")
+
+def fetch_reviews_for_title(title: str, max_reviews: int = 100, df: pd.DataFrame = None) -> List[Dict[str, Any]]:
+    """
+    Fetch reviews for a movie by title from the Kaggle dataset.
+    Returns a list of dicts with keys: movie, movie_id, rating, date, summary, content.
+
+    The Kaggle dataset is labeled by sentiment only (no movie titles), so a
+    deterministic random sample of reviews is attributed to the requested title.
+    """
+    if df is None:
+        df = _load_kaggle_dataset()
+
+    # The dataset should have 'review' and 'sentiment' columns after loading.
+    review_col = 'review' if 'review' in df.columns else None
+    sentiment_col = 'sentiment' if 'sentiment' in df.columns else None
+
+    # Fallback: use first column if review not found
+    if review_col is None:
+        if len(df.columns) > 0:
+            review_col = df.columns[0]
+        else:
+            print("  Warning: Could not find review column in dataset")
+            return []
+
+    # Sample up to max_reviews rows. BUGFIX: seed the sampler so repeated
+    # runs (and the pickle cache built from them) are reproducible; the
+    # unseeded sample produced different "cached" data on every rebuild.
+    sampled = df.sample(min(max_reviews, len(df)), random_state=42)
+
+    out = []
+    for idx, row in sampled.iterrows():
+        content = str(row[review_col])
+        rating = None
+        if sentiment_col:
+            # Map the binary sentiment label onto a rough 1-10 rating scale.
+            sent = str(row[sentiment_col]).lower()
+            if 'positive' in sent:
+                rating = 8
+            elif 'negative' in sent:
+                rating = 3
+
+        out.append({
+            "movie": title,  # Use the requested title since dataset doesn't have titles
+            "movie_id": f"sample_{idx}",
+            "rating": rating,
+            "date": None,
+            "summary": content[:100] + "..." if len(content) > 100 else content,
+            "content": content,
+        })
+
+    return out
+
+def get_or_build_cache(titles: List[str], imdb_ids: Dict[str, str] = None) -> Dict[str, List[Dict[str, Any]]]:
+    """
+    Return cached reviews, building the cache from the Kaggle dataset if needed.
+
+    Cache structure: {title: [review, ...], ...} pickled to CACHE_FILE.
+    imdb_ids is accepted for interface compatibility but unused by the
+    Kaggle-backed implementation.
+    """
+    if os.path.exists(CACHE_FILE):
+        with open(CACHE_FILE, "rb") as f:
+            return pickle.load(f)
+
+    print("Loading Kaggle IMDb dataset...")
+    df = _load_kaggle_dataset()
+
+    # At least 50 reviews per movie, splitting a nominal budget of 100.
+    reviews_per_movie = max(50, 100 // len(titles)) if titles else 50
+
+    data = {}
+    for i, t in enumerate(titles, start=1):
+        print(f"[{i}/{len(titles)}] Fetching reviews for: {t}")
+        data[t] = fetch_reviews_for_title(t, max_reviews=reviews_per_movie, df=df)
+        print(f"  Found {len(data[t])} reviews")
+
+    with open(CACHE_FILE, "wb") as f:
+        pickle.dump(data, f)
+    return data
diff --git a/text_utils.py b/text_utils.py
new file mode 100644
index 0000000..4071300
--- /dev/null
+++ b/text_utils.py
@@ -0,0 +1,65 @@
+# text_utils.py
+# Basic cleaning, tokenizing, stopwords, and small helpers.
+#
+# AI Assistance (ChatGPT, November 2025):
+# - Used for generating stopword list (tedious task of compiling common stopwords)
+# - Used for documentation: Writing docstrings and comments explaining preprocessing steps
+# - Used for troubleshooting: "How to remove HTML tags and decode HTML entities in Python text"
+
+import re
+from collections import Counter
+from typing import List, Dict, Tuple
+import html
+
+DEFAULT_STOPWORDS = {
+    "the","and","a","of","to","in","it","is","i","was","for","on","that","this",
+    "with","but","movie","film","you","they","he","she","we","are","as","an","be",
+    "by","or","at","from","so","if","not","have","has","had","its","my","me","your",
+    "their","our","them","his","her","which","who","what","when","where","how",
+    "can","could","should","would","did","do","does","than","then","there","here",
+    "about","into","out","up","down","over","again","more","also","just"
+}
+
+def clean_text(s: str) -> str:
+    """Lowercase, HTML-decode, strip tags/punctuation, and collapse whitespace."""
+    s = (s or "").lower()
+    # Decode HTML entities (e.g., "&amp;" becomes "&")
+    s = html.unescape(s)
+    # Strip HTML tags such as the "br" line breaks common in IMDb reviews
+    s = re.sub(r"<[^>]+>", " ", s)
+    # keep alphanumerics and spaces; strip punctuation; flatten whitespace
+    s = re.sub(r"[^a-z0-9\s]", " ", s)
+    s = re.sub(r"\s+", " ", s).strip()
+    return s
+
+def tokenize(s: str) -> List[str]:
+    """Split cleaned text on whitespace into tokens."""
+    return s.split()
+
+def remove_stopwords(tokens: List[str], stopwords=DEFAULT_STOPWORDS) -> List[str]:
+    """Drop stopwords and single-character tokens."""
+    return [t for t in tokens if t not in stopwords and len(t) > 1]
+
+def word_frequencies(tokens: List[str]) -> Counter:
+    """Count token occurrences."""
+    return Counter(tokens)
+
+def summary_stats(tokens: List[str]) -> Dict[str, float]:
+    """Return token count, vocabulary size, average word length, and type/token ratio."""
+    if not tokens:
+        return {"num_tokens": 0, "vocab_size": 0, "avg_word_len": 0.0, "type_token_ratio": 0.0}
+    vocab = set(tokens)
+    avg_len = sum(len(t) for t in tokens) / len(tokens)
+    return {
+        "num_tokens": len(tokens),
+        "vocab_size": len(vocab),
+        "avg_word_len": round(avg_len, 3),
+        "type_token_ratio": round(len(vocab)/len(tokens), 4)
+    }
+
+def ascii_bar_chart(items: List[Tuple[str, int]], width: int = 48) -> str:
+    """Render (label, count) pairs as a right-aligned ASCII bar chart string."""
+    if not items:
+        return "(no data)"
+    maxv = max(c for _, c in items)
+    maxlab = max(len(w) for w, _ in items)
+    lines = []
+    for w, c in items:
+        bar = "#" * int(width * (c / maxv)) if maxv else ""
+        lines.append(f"{w.rjust(maxlab)} | {bar} {c}")
+    return "\n".join(lines)
+