From 0dcaa96a98b7cc73c6d742fb608b76b58e9b0b5b Mon Sep 17 00:00:00 2001
From: Aditya Rana <62557376+Kingslayer4515@users.noreply.github.com>
Date: Wed, 25 Dec 2024 11:59:58 +0530
Subject: [PATCH 1/3] Add files via upload

---
 books_scraper/__init__.py                                     |   0
 books_scraper/__pycache__/__init__.cpython-39.pyc             | Bin 0 -> 159 bytes
 books_scraper/__pycache__/pipelines.cpython-39.pyc            | Bin 0 -> 1434 bytes
 books_scraper/__pycache__/settings.cpython-39.pyc             | Bin 0 -> 358 bytes
 books_scraper/items.py                                        |  12 ++
 books_scraper/middlewares.py                                  | 103 ++++++++++++++++++
 books_scraper/pipelines.py                                    |  39 +++++++
 books_scraper/readme.md                                       |  34 ++++++
 books_scraper/settings.py                                     |  91 ++++++++++++++++
 books_scraper/spiders/__init__.py                             |   4 +
 books_scraper/spiders/__pycache__/__init__.cpython-39.pyc     | Bin 0 -> 167 bytes
 books_scraper/spiders/__pycache__/books_spider.cpython-39.pyc | Bin 0 -> 1024 bytes
 books_scraper/spiders/books_spider.py                         |  28 +++++
 13 files changed, 311 insertions(+)
 create mode 100644 books_scraper/__init__.py
 create mode 100644 books_scraper/__pycache__/__init__.cpython-39.pyc
 create mode 100644 books_scraper/__pycache__/pipelines.cpython-39.pyc
 create mode 100644 books_scraper/__pycache__/settings.cpython-39.pyc
 create mode 100644 books_scraper/items.py
 create mode 100644 books_scraper/middlewares.py
 create mode 100644 books_scraper/pipelines.py
 create mode 100644 books_scraper/readme.md
 create mode 100644 books_scraper/settings.py
 create mode 100644 books_scraper/spiders/__init__.py
 create mode 100644 books_scraper/spiders/__pycache__/__init__.cpython-39.pyc
 create mode 100644 books_scraper/spiders/__pycache__/books_spider.cpython-39.pyc
 create mode 100644 books_scraper/spiders/books_spider.py

diff --git a/books_scraper/__init__.py b/books_scraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/books_scraper/__pycache__/__init__.cpython-39.pyc b/books_scraper/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..129e298c872989e659790c35ff38c95335051672
GIT binary patch
[binary data for the committed __pycache__/*.pyc files omitted]

diff --git a/books_scraper/spiders/__pycache__/books_spider.cpython-39.pyc b/books_scraper/spiders/__pycache__/books_spider.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b4a84bb9a714d19efaf7fa83c19d5fd0ffed554
GIT binary patch
[binary data omitted]
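The patch above adds a 28-line `books_spider.py`, but its text hunk does not survive in this extract. As a rough illustration only, a Books to Scrape spider of that shape might look like the sketch below; the `books_spider` name and the four fields come from the patch and the readme in the next commit, while the selectors and pagination logic are assumptions, not the patch's actual code.

```python
import scrapy


class BooksSpider(scrapy.Spider):
    """Illustrative sketch of a Books to Scrape spider (not the patch's books_spider.py)."""

    name = "books_spider"
    start_urls = ["http://books.toscrape.com/"]

    def parse(self, response):
        # Each product on a listing page sits inside an <article class="product_pod">.
        for book in response.css("article.product_pod"):
            yield {
                "product_name": book.css("h3 a::attr(title)").get(),
                "price": book.css("p.price_color::text").get(),
                # The rating is encoded in the class name, e.g. <p class="star-rating Three">.
                "rating": book.css("p.star-rating::attr(class)").get(),
                "availability": book.css("p.instock.availability::text").getall(),
            }

        # Follow the pagination link until the last page.
        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
```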
From ffc79ec2bd30f08fbda9f7c21d54096c53c8a417 Mon Sep 17 00:00:00 2001
From: Aditya Rana <62557376+Kingslayer4515@users.noreply.github.com>
Date: Wed, 25 Dec 2024 21:43:36 +0530
Subject: [PATCH 2/3] Add files via upload

---
 readme.md | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 readme.md

diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..3015bbb
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,88 @@
+
+# Books Web Scraper
+
+This project is a web scraper built with the Scrapy framework. It crawls the [Books to Scrape](http://books.toscrape.com/) website to extract product details such as name, price, rating, and availability. The extracted and cleaned data is stored in a MongoDB database.
+
+---
+
+## Overview
+
+The scraper performs the following tasks:
+1. Crawls the website to extract:
+   - Product Name
+   - Price
+   - Rating
+   - Availability
+2. Cleans and processes the extracted data:
+   - Converts the price to a float
+   - Maps word ratings to numerical values
+   - Standardizes availability to "In Stock" or "Out of Stock"
+3. Stores the processed data in a MongoDB database with the schema:
+   - `product_name` (string)
+   - `price` (float)
+   - `rating` (float)
+   - `availability` (string)
+
+---
+
+## Setup and Running the Scraper
+
+### Prerequisites
+- Python 3.7 or above
+- MongoDB installed and running locally
+- pip (Python package manager)
+- **Scrapy**: for web scraping
+- **pymongo**: to interact with MongoDB
+- **Rust compiler**: needed only if a dependency such as `cryptography` has to be built from source
+
+---
+
+### Installation
+
+1. Clone the repository:
+   ```bash
+   git clone <repository-url>
+   cd <repository-folder>
+   ```
+
+2. Create and activate a virtual environment:
+   ```bash
+   python -m venv venv
+   venv\Scripts\activate    # Windows; use `source venv/bin/activate` on Linux/macOS
+   ```
+
+3. Install the dependencies:
+   ```bash
+   pip install scrapy pymongo
+   ```
+
+4. Ensure MongoDB is running locally.
+
+### Running the Crawler
+
+1. Navigate to the project directory:
+   ```bash
+   cd books_scraper
+   ```
+
+2. Run the Scrapy spider:
+   ```bash
+   scrapy crawl books_spider
+   ```
+
+3. Check MongoDB Compass to verify the scraped data in the `books_database` database under the `products` collection.
+
+---
+
+## Dependencies
+- **Scrapy**: for web crawling and data extraction.
+- **pymongo**: to interact with MongoDB.
+- **Rust**: required to build certain Python dependencies (e.g., cryptography) from source.
+
+
+
+
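The readme above describes the cleaning rules (price to float, word ratings to numbers, standardized availability) and the MongoDB target (`books_database` / `products`), but the patch's `pipelines.py` hunk is not shown in this extract. A minimal sketch of pipelines implementing those rules with Scrapy's item-pipeline hooks and pymongo follows; the rating mapping, the connection URI, and the class names are assumptions, not the patch's actual code.

```python
import re

import pymongo

# Word ratings used by books.toscrape.com, mapped to numbers (assumed mapping).
RATING_MAP = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}


class CleaningPipeline:
    """Cleans raw scraped fields into the schema described in the readme."""

    def process_item(self, item, spider):
        # "£51.77" -> 51.77
        price = item.get("price") or ""
        item["price"] = float(re.sub(r"[^\d.]", "", price)) if price else None

        # "star-rating Three" -> 3.0
        rating_class = (item.get("rating") or "").replace("star-rating", "").strip()
        item["rating"] = float(RATING_MAP.get(rating_class, 0))

        # Join the availability text fragments and standardize the wording.
        availability = " ".join(item.get("availability") or []).strip()
        item["availability"] = "In Stock" if "In stock" in availability else "Out of Stock"

        item["product_name"] = (item.get("product_name") or "").strip()
        return item


class MongoPipeline:
    """Inserts cleaned items into MongoDB (database and collection names from the readme)."""

    def open_spider(self, spider):
        # Assumes a local MongoDB instance on the default port.
        self.client = pymongo.MongoClient("mongodb://localhost:27017")
        self.collection = self.client["books_database"]["products"]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item
```

Wired up this way, the pipelines would be enabled in `settings.py` with something like `ITEM_PIPELINES = {"books_scraper.pipelines.CleaningPipeline": 300, "books_scraper.pipelines.MongoPipeline": 400}`; the class names and priorities in the patch's own `settings.py` may differ.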
From 6ca26835f670cf719693dcc9865dc950884118dc Mon Sep 17 00:00:00 2001
From: Aditya Rana <62557376+Kingslayer4515@users.noreply.github.com>
Date: Wed, 25 Dec 2024 21:45:06 +0530
Subject: [PATCH 3/3] Delete README.md

---
 README.md | 73 -------------------------------------------------------
 1 file changed, 73 deletions(-)
 delete mode 100644 README.md

diff --git a/README.md b/README.md
deleted file mode 100644
index 3ae235b..0000000
--- a/README.md
+++ /dev/null
@@ -1,73 +0,0 @@
-# Data Engineering Internship Assignment
-
-Welcome to the Data Engineering Internship Assignment! This task is designed to evaluate your problem-solving skills, understanding of data pipelines, and ability to work with web crawling and data extraction. Please read the instructions carefully and submit your solution as per the guidelines provided.
-
-## Problem Statement
-
-You are tasked with building a basic web crawling pipeline to extract and process data from a target website. The goal is to:
-
-1. **Crawl** a given webpage to extract specific information.
-2. **Clean and process** the extracted data.
-3. **Store** the processed data into a MongoDB database.
-
-### Target Website
-
-You will be working with the Books to Scrape website (http://books.toscrape.com/) or any other publicly accessible e-commerce website containing product information. Ensure that your crawler abides by the website's `robots.txt` policy.
-
-### Tasks
-
-#### Step 1: Web Crawling
-
-1. Use the `Scrapy` framework to:
-   - Fetch the HTML content of the target webpage.
-   - Extract product details such as:
-     - `Product Name`
-     - `Price`
-     - `Rating`
-     - `Availability Status`
-
-#### Step 2: Data Transformation
-
-1. Clean the extracted data (e.g., remove extra whitespace, convert prices to float, handle missing ratings).
-2. Standardize the data (e.g., convert availability status to `In Stock` or `Out of Stock`).
-
-#### Step 3: Data Storage
-
-1. Store the processed data into a MongoDB database.
-2. Use a collection named `products` with the following schema:
-   - `product_name` (string)
-   - `price` (float)
-   - `rating` (float)
-   - `availability` (string)
-
-#### Step 4: Documentation
-
-Prepare a `README.md` file that includes:
-1. An overview of your solution.
-2. Steps to set up and run your crawler.
-3. Dependencies and setup instructions.
-
-#### Step 5: Git Guidelines
-
-1. Use meaningful commit messages.
-2. Follow a proper branch naming convention (e.g., `feature/`).
-3. Ensure your code is clean, modular, and well-commented.
-
-## Submission Guidelines
-
-1. Fork this repository.
-2. Create a new branch named `submission/`.
-3. Commit your code and push it to your forked repository.
-4. Create a Pull Request (PR) to the `main` branch of this repository.
-5. Include your `README.md` and ensure your code is well-documented.
-
-## Evaluation Criteria
-
-1. **Correctness**: Does your solution meet the requirements?
-2. **Code Quality**: Is your code clean, modular, and well-documented?
-3. **Efficiency**: Are the crawling and transformations optimized?
-4. **Git Practices**: Are proper git guidelines followed?
-
----
-
-Good luck! We look forward to reviewing your submission.
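Besides checking MongoDB Compass as the readme suggests, a few lines of pymongo can confirm that documents reached the `products` collection. This is a sketch assuming the database and collection names from the readme and a local MongoDB instance on the default port.

```python
import pymongo

# Assumes the names from the readme and a local MongoDB on the default port.
client = pymongo.MongoClient("mongodb://localhost:27017")
products = client["books_database"]["products"]

print("Documents stored:", products.count_documents({}))

# Show a few sample documents to spot-check the cleaned fields.
for doc in products.find().limit(3):
    print(doc["product_name"], doc["price"], doc["rating"], doc["availability"])
```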