diff --git a/Apple_Inc._top10.png b/Apple_Inc._top10.png new file mode 100644 index 0000000..7914df6 Binary files /dev/null and b/Apple_Inc._top10.png differ diff --git a/Apple_Inc._wordcloud.png b/Apple_Inc._wordcloud.png new file mode 100644 index 0000000..e7e27c0 Binary files /dev/null and b/Apple_Inc._wordcloud.png differ diff --git a/Microsoft_top10.png b/Microsoft_top10.png new file mode 100644 index 0000000..c4e37d0 Binary files /dev/null and b/Microsoft_top10.png differ diff --git a/Microsoft_wordcloud.png b/Microsoft_wordcloud.png new file mode 100644 index 0000000..783b010 Binary files /dev/null and b/Microsoft_wordcloud.png differ diff --git a/README.md b/README.md index 05aa109..f87c5b0 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,47 @@ # Text-Analysis-Project Please read the [instructions](instructions.md). + +Project Overview: + +For this project, I did my best to apply the TF-IDF methods I learned in my Machine Learning class. However, although I had a solid idea of tokenization and such, I had issues performing those methods in Python rather than R Studio, which is something that AI helped me tremendously. +For this project I collected wikipedia articles as my main data source through the MediaWiki API from the mediawiki Python library. I collected data particularly from Apple, Microsoft and Samsung to compare the most frequently used terms in each article and identify the top TF-IDF keywords unique to each company since I wanted to find the differences of focus between companies within the same industry. + +Implementation: +I used multiple Python files: + - fetch.py (Uses MediaWiki to fetch article text for each topic.) 
+ - clean.py (Cleans and tokenizes text by removing punctuation, converting to lowercase, and removing stop words) + - tfidf.py (Calculates TF, IDF, and TF-IDF scores using dictionaries and counters) + - visualize.py (Generates bar charts of the top keywords and word clouds to visualize the most important terms) + - main.py (Integrates all of the files mentioned above and also saves the output graphs in a figures/ folder)![alt text](image-1.png) + +I applied the term frequency–inverse document frequency (TF–IDF) method to identify which words were most unique to each company’s article. Term frequency (TF) captured how often a word appeared in one document, while inverse document frequency (IDF) measured how rare it was across all three documents. The multiplication of the two gave each term a weight that emphasized words distinctive to a specific company. I then visualized the results with matplotlib bar charts and word clouds, saving each plot to a /figures/ directory. + +A key design decision I faced was whether to analyze the text using TF-IDF or a sentiment analysis approach with libraries like NLTK. I chose TF-IDF because I was more familiar with it than NLTK. + +One issue I ran into, which made me glad that I kept checking my results, was this (![alt text](image.png)): I had to make sure that I got rid of Samsung('s), or else the TF-IDF would simply yield another result that doesn't matter that much. +Throughout the development of this project, I used ChatGPT to help debug TF-IDF weighting logic, manage file paths, and structure my functions cleanly. 
+ + +Results: +Apple Top 10 TF-IDF![alt text](image-2.png) +Microsoft Top 10 TF-IDF ![alt text](../figures/Microsoft_top10.png) +Samsung Top 10 TF-IDF ![alt text](../figures/Samsung_Electronics_top10.png) + +Apple WordCloud ![alt text](image-3.png) +Microsoft WordCloud ![alt text](../figures/Microsoft_wordcloud.png) +Samsung WordCloud ![alt text](../figures/Samsung_Electronics_wordcloud.png) + +For Apple, top keywords like apple, jobs, macintosh, and iphone highlight the company’s focus on product design and its strong link to co-founders Steve Jobs and Steve Wozniak. +Microsoft stood out with terms like windows, azure, xbox, and gates, showing its history in software, gaming, and cloud computing. +For Samsung, words such as galaxy, semiconductor, and design reflect its strength in hardware, electronics, and large-scale production. + +Overall, the graphs and word clouds show that each company’s Wikipedia article reflects its core identity — Apple with innovation and products, Microsoft with software and cloud systems, and Samsung with manufacturing and technology. + +Reflection: + +When starting this project, I knew I’d use AI a lot, but I also knew the code might not always work whenever I asked it to do something. So before I started, I decided to be very direct and specific with what I wanted to do and created a step-by-step process to achieve it: fetching first, then cleaning, and so on, which made the process much smoother. Initially I wanted to conduct sentiment analysis on Amazon products since it would help users find the most loved products. I wish I were better at organizing multiple files and functions across different modules, since it felt confusing at first. +My biggest takeaway is that I should have a step-by-step plan when writing code. Just like how we need a docstring for functions, we need a plan for a project. 
+Overall, this project taught me not only about text analytics, but also about writing cleaner, modular Python code that’s easy to maintain and scale. + + diff --git a/Samsung_Electronics_top10.png b/Samsung_Electronics_top10.png new file mode 100644 index 0000000..2f12c56 Binary files /dev/null and b/Samsung_Electronics_top10.png differ diff --git a/Samsung_Electronics_wordcloud.png b/Samsung_Electronics_wordcloud.png new file mode 100644 index 0000000..f42d20c Binary files /dev/null and b/Samsung_Electronics_wordcloud.png differ diff --git a/clean.py b/clean.py new file mode 100644 index 0000000..56dbff0 --- /dev/null +++ b/clean.py @@ -0,0 +1,74 @@ +# clean the feteched data + +import unicodedata +import re +import nltk +from nltk.corpus import stopwords + + +def split_line(line:str): # Finds split lines so tokenization is easier + return line.replace('-', ' ').replace('–', ' ').replace('—', ' ').split() + +def find_punctuation(text:str): # Finds punctuation in text to remove + punc_marks = {} + for char in text: + category = unicodedata.category(char) + if category.startswith("P"): + punc_marks[char] = 1 + return "".join(punc_marks) + +def clean_word(word: str, punctuation: str): + return word.strip(punctuation).lower() + +POSSESSIVE_RE = re.compile(r"[’']s\b") # samsung's -> samsung +TRAILING_APOS_RE = re.compile(r"s[’']\b") # companies' -> companies +URL_RE = re.compile(r'(https?://\S+|www\.[^\s]+)', re.IGNORECASE) +DIGITS_RE = re.compile(r'\d+') +MULTISPACE_RE = re.compile(r'\s+') + + +def clean_basic(text: str) -> str: + """Remove URLs, possessives, digits/numbers, and collapse whitespace. 
+ I am familiar with the reason we needed to preprocess and clean the data, and AI + helped suggest a way to do it""" + # remove URLs + text = URL_RE.sub(' ', text) + # normalize curly and straight apostrophes first (to make regex consistent) + text = text.replace("’", "'").replace("‘", "'") + # remove possessive endings (e.g., samsung's → samsung) + text = re.sub(r"\b(\w+)'s\b", r"\1", text) + # remove plural possessives (e.g., companies' → companies) + text = re.sub(r"\b(\w+)'\b", r"\1", text) + # remove digits and numbers (e.g., 2024, 10,000) + text = DIGITS_RE.sub(' ', text) + # collapse multiple spaces/newlines/tabs + text = MULTISPACE_RE.sub(' ', text).strip() + return text + +def clean_then_strip_punct_and_lower(text: str) -> list[str]: # Tokenizes Words + cleaned = clean_basic(text) + punctuation = find_punctuation(cleaned) + words = [] + for line in cleaned.splitlines(): + for w in split_line(line): + w = clean_word(w, punctuation) + if w: + words.append(w) + return words +# download once (safe to leave here) +nltk.download('stopwords', quiet=True) + +# base English stopwords +STOP = set(stopwords.words('english')) + +# optional: domain-specific stopwords for Wikipedia/company pages (AI helped generate this idea to me since it made sense to get rid of generic words) +EXTRA_STOP = { + 'inc', 'ltd', 'co', 'corp', 'corporation', 'company', + 'electronics', 'technology', 'software', 'services', + 'usa', 'us', 'u', 's', '===','==','=', # sometimes appear after punctuation removal and AI helped me get rid of these words by suggesting this method +} +STOP |= EXTRA_STOP + +def remove_stopwords(tokens: list[str]) -> list[str]: + """Filter out stop words and very short leftovers.""" + return [t for t in tokens if t not in STOP and len(t) > 2] diff --git a/fetch.py b/fetch.py new file mode 100644 index 0000000..cca7676 --- /dev/null +++ b/fetch.py @@ -0,0 +1,39 @@ +# fetch.py +# Fetch multiple Wikipedia articles by title and return {title: content} + +from mediawiki 
import MediaWiki +from mediawiki.exceptions import DisambiguationError, PageError + +def fetch_articles(titles: list[str]) -> dict[str, str]: + wikipedia = MediaWiki() + articles: dict[str, str] = {} + + for title in titles: + try: + # normal fetch + page = wikipedia.page(title) + articles[title] = page.content + + except DisambiguationError as e: + # if ambiguous, try the first suggested option + try: + alt = e.options[0] + page = wikipedia.page(alt) + articles[title] = page.content + print(f"⚠️ '{title}' was ambiguous; used '{alt}'.") + except Exception as e2: + print(f"⚠️ Could not fetch {title} (disambiguation): {e2}") + + except PageError as e: + print(f"⚠️ Could not fetch {title}: {e}") + + except Exception as e: + print(f"⚠️ Could not fetch {title}: {e}") + + return articles + +if __name__ == "__main__": + # tiny self-test + data = fetch_articles(["Tesla"]) + for k in data: + print("Fetched:", k, "chars:", len(data[k])) diff --git a/main.py b/main.py new file mode 100644 index 0000000..0fb9202 --- /dev/null +++ b/main.py @@ -0,0 +1,65 @@ +from fetch import fetch_articles +from mediawiki import MediaWiki +from clean import clean_then_strip_punct_and_lower, remove_stopwords +from tfidf import compute_tf, compute_idf, compute_tfidf +from visualize import plot_top_keywords, generate_wordcloud +from collections import Counter +from visualize import plot_top_keywords, generate_wordcloud +import os + +def compute_word_frequency(tokens): + """Count raw word frequencies for a document.""" + return Counter(tokens) + +def main(): + topics = [ "Apple Inc.", "Microsoft", "Samsung Electronics"] + articles = fetch_articles(topics) + + cleaned_docs = {} + + # Step 1: Clean & preprocess + for title, content in articles.items(): + tokens = clean_then_strip_punct_and_lower(content) + tokens_no_stop = remove_stopwords(tokens) + cleaned_docs[title] = tokens_no_stop + + # Step 2a: Word frequency (raw counts) BEFORE TF-IDF + for title, tokens in cleaned_docs.items(): + word_freq = 
compute_word_frequency(tokens) # Counter + top_words = word_freq.most_common(10) + print(f"\n--- {title}: Top 10 Frequent Words ---") + for word, freq in top_words: + print(f"{word:<15} {freq}") + + # Step 2b: Compute IDF across all documents + idf = compute_idf(cleaned_docs.values()) + + outdir = "figures" + os.makedirs(outdir, exist_ok=True) + + # Step 3: Compute TF-IDF for each document + for title, tokens in cleaned_docs.items(): + tfidf_scores = compute_tfidf(tokens, idf) + + # get top 10 keywords by TF-IDF score + top_keywords = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:10] + + print(f"{title}: TF-IDF terms count =", len(tfidf_scores)) + print(f"\n--- {title} ---") + + for word, score in top_keywords: + print(f"{word:<20} {score:.4f}") + print("-" * 60) + + # Visualize + plot_top_keywords( + tfidf_scores, title, n=10, + save_path=f"{outdir}/{title.replace(' ', '_')}_top10.png" + ) + generate_wordcloud( + tfidf_scores, title, + save_path=f"{outdir}/{title.replace(' ', '_')}_wordcloud.png" + ) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tfidf.py b/tfidf.py new file mode 100644 index 0000000..2245ef3 --- /dev/null +++ b/tfidf.py @@ -0,0 +1,26 @@ +import math +from collections import Counter, defaultdict + +def compute_tf(tokens): # AI/Chat helped me perform the calculations for TF and IDF, which was taught to me in my machine learning class + """Compute term frequency for one document.""" + tf = Counter(tokens) + total = sum(tf.values()) + return {term: freq / total for term, freq in tf.items()} + +def compute_idf(all_docs): + """Compute inverse document frequency from multiple docs.""" + N = len(all_docs) + idf = defaultdict(int) + + # count in how many docs each term appears + for doc in all_docs: + for term in set(doc): + idf[term] += 1 + + # compute IDF + return {term: math.log((N + 1) / (df + 1)) + 1 for term, df in idf.items()} + +def compute_tfidf(tokens, idf): + """Compute TF-IDF for one document 
given its IDF dictionary.""" + tf = compute_tf(tokens) + return {term: tf_val * idf.get(term, 0) for term, tf_val in tf.items()} diff --git a/visualize.py b/visualize.py new file mode 100644 index 0000000..0c9bf1f --- /dev/null +++ b/visualize.py @@ -0,0 +1,51 @@ +# visualize.py +import os +import matplotlib.pyplot as plt +from wordcloud import WordCloud + +def plot_top_keywords(tfidf_scores, title, n=10, save_path=None): + """Bar chart of top n TF-IDF terms. If save_path is given, saves PNG instead of showing. + My experience wtih TF-IDF was with R so I needed alot of help from Ai to be able to perform this task""" + + top_items = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:n] + if not top_items: + return + words, scores = zip(*top_items) + + plt.figure(figsize=(8, 4)) + plt.barh(words, scores) + plt.gca().invert_yaxis() + plt.title(f"Top {n} TF-IDF Keywords – {title}") + plt.xlabel("TF-IDF Score") + plt.tight_layout() + + if save_path: + os.makedirs(os.path.dirname(save_path), exist_ok=True) + plt.savefig(save_path, dpi=150) + plt.close() + else: + plt.show() + + +def generate_wordcloud(tfidf_scores, title, save_path=None): + """Word cloud based on TF-IDF weights. If save_path is given, saves PNG instead of showing. + I am familiar with how word clouds work and their functionality, + but still, since it was new to me in python, I heavily needed help from AI""" + if not tfidf_scores: + return + + wc = WordCloud(width=800, height=400, background_color='white') + wc.generate_from_frequencies(tfidf_scores) + + plt.figure(figsize=(8, 4)) + plt.imshow(wc, interpolation='bilinear') + plt.axis('off') + plt.title(f"Word Cloud – {title}") + plt.tight_layout() + + if save_path: + os.makedirs(os.path.dirname(save_path), exist_ok=True) + plt.savefig(save_path, dpi=150) + plt.close() + else: + plt.show()