@rogeliohdev (Collaborator)

Scrapper Definitive 🥵

Description:

  • This pull request significantly improves the scraper's control flow, making product data easier to extract and organize.
  • The previous version is compared against the updated one below, highlighting the key improvements in code structure and functionality.

Changes Made:

  • Introduced the ProductClassifier class to classify products based on keywords in the title and description (a usage sketch follows this list).
  • Refactored the code to separate data extraction by category into specific methods.
  • Updated data extraction methods to include more detailed information about different types of products.
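
For reference, here is a minimal sketch of how the new classifier behaves, assuming the ProductClassifier class shown below is saved in a module named scrapper (the module name and the HTML fixture are hypothetical, not part of this PR):

from bs4 import BeautifulSoup
from scrapper import ProductClassifier  # hypothetical module name

# Hypothetical fixture: a stripped-down product page containing only the
# elements classify_product actually reads (title and meta description).
html = '''
<html>
  <head><meta name="description" content="Smartphone con pantalla AMOLED"></head>
  <body><span id="productTitle">Teléfono XYZ 128 GB</span></body>
</html>
'''

soup = BeautifulSoup(html, 'html.parser')
print(ProductClassifier.classify_product(soup))  # prints: Tecnología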

Before and After:

Before:

# Original code here
from bs4 import BeautifulSoup
import json
import requests

class Scrapper:
    def __init__(self, url):
        self.url = url

    def try_extract_text(self, soup, identifier, method='find', tag_type='span', class_name=None, default="No encontrado"):
        # Look up a spec label (e.g. 'Marca') and return the text of the
        # adjacent <td>; fall back to `default` when nothing matches.
        element = None  # guard against UnboundLocalError for an unknown `method`
        try:
            if method == 'find':
                if class_name:
                    element = soup.find(tag_type, class_=class_name)
                else:
                    element = soup.find(tag_type, string=identifier).find_next('td')
            elif method == 'find_all':
                element = soup.find_all(tag_type, string=identifier)[0].find_next('td')
            return element.get_text(strip=True) if element else default
        except (AttributeError, IndexError):
            return default

    def general_extract(self, soup, search_params, default="No encontrado"):
        try:
            element = soup.select_one(search_params) if search_params else None
            return element.get_text(strip=True) if element else default
        except (AttributeError, IndexError):
            return default

    def scrape(self):
        data = {}
        response = requests.get(self.url)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract the data using the extraction helpers
        data['product_name'] = self.general_extract(soup, 'span#productTitle')
        data['rating'] = self.general_extract(soup, 'span.a-size-base.a-color-base')
        data['rating_count'] = self.general_extract(soup, 'span#acrCustomerReviewText')
        data['marca'] = self.try_extract_text(soup, 'Marca')
        data['modelo'] = self.try_extract_text(soup, 'Nombre del modelo')
        data['software'] = self.try_extract_text(soup, 'Sistema operativo')
        data['tecnologia'] = self.try_extract_text(soup, 'Tecnología celular')
        data['capacidad'] = self.try_extract_text(soup, 'Capacidad de almacenamiento de la memoria')
        data['Tamano'] = self.try_extract_text(soup, 'Tamaño de la pantalla')

        # Additional fields to extract, with error handling
        fields = [
            ('Tipo de tejido', 'tejido'),
            ('Instrucciones de cuidado del material', 'washing'),
            ('Tipo de cierre', 'zipper'),
            ('Tipo de manga', 'manga')
        ]

        for field_name, field_key in fields:
            try:
                row = soup.find('span', string=field_name).find_parent('div', {'class': 'a-fixed-left-grid-inner'})
                data[field_key] = row.find('div', class_='a-fixed-left-grid-col a-col-right').get_text(strip=True) if row else f"{field_name} no encontrado"
            except AttributeError:
                data[field_key] = f"{field_name} no encontrado"
        
        # Save the collected data to a JSON file
        with open("scraped_data.json", 'w', encoding='utf-8') as json_file:
            json.dump([data], json_file, indent=4, ensure_ascii=False)
        print("Datos guardados en scraped_data.json")

# Using the Scrapper class
url = 'https://www.amazon.com.mx/Im-Glad-My-Mom-Died/dp/1982185821/ref=pd_sim_d_sccl_2_2/137-9977645-3285747?pd_rd_w=qcC30&content-id=amzn1.sym.6dc04e10-649c-403d-a982-6e188819ce76&pf_rd_p=6dc04e10-649c-403d-a982-6e188819ce76&pf_rd_r=R8RHWZ50KN6MD320M2BB&pd_rd_wg=VUinI&pd_rd_r=cdd5c84c-599e-4750-a7a6-9424d3083445&pd_rd_i=1982185821&psc=1'
objeto = Scrapper(url)
objeto.scrape()

After:

from bs4 import BeautifulSoup
import json
import requests

class ProductClassifier:
    @staticmethod
    def classify_product(soup):
        title_element = soup.select_one('span#productTitle')
        if title_element:
            title = title_element.get_text(strip=True)
        else:
            title = "No encontrado"
        
        description_element = soup.select_one('meta[name="description"]')
        if description_element:
            description = description_element.get('content', '').lower()
        else:
            description = ""
        
        # Categories based on keywords in the title or description
        if any(keyword in title.lower() for keyword in ['teléfono', 'smartphone', 'tablet']):
            return 'Tecnología'
        elif any(keyword in title.lower() for keyword in ['zapatos', 'ropa', 'moda', 'vestido']):
            return 'Moda'
        elif any(keyword in title.lower() for keyword in ['hogar', 'casa', 'mueble', 'decoración']):
            return 'Hogar y Cocina'
        elif any(keyword in title.lower() for keyword in ['libro', 'ebook', 'novela', 'literatura']):
            return 'Libros'
        elif any(keyword in description for keyword in ['alimentación', 'comida', 'bebida']):
            return 'Alimentación y Bebidas'
        elif any(keyword in title.lower() for keyword in ['deportes', 'ejercicio', 'fitness', 'gimnasio']):
            return 'Deportes y Aire Libre'
        elif any(keyword in title.lower() for keyword in ['juguete', 'juego', 'muñeca', 'infantil']):
            return 'Juguetes y Juegos'
        else:
            return 'Otro'

class Scrapper:
    def __init__(self, url):
        self.url = url

    def try_extract_text(self, soup, identifier, method='find', tag_type='span', class_name=None, default="No encontrado"):
        # Look up a spec label (e.g. 'Marca') and return the text of the
        # adjacent <td>; fall back to `default` when nothing matches.
        element = None  # guard against UnboundLocalError for an unknown `method`
        try:
            if method == 'find':
                if class_name:
                    element = soup.find(tag_type, class_=class_name)
                else:
                    element = soup.find(tag_type, string=identifier).find_next('td')
            elif method == 'find_all':
                element = soup.find_all(tag_type, string=identifier)[0].find_next('td')
            return element.get_text(strip=True) if element else default
        except (AttributeError, IndexError):
            return default

    def general_extract(self, soup, search_params, default="No encontrado"):
        try:
            element = soup.select_one(search_params) if search_params else None
            return element.get_text(strip=True) if element else default
        except (AttributeError, IndexError):
            return default

    def scrape(self):
        response = requests.get(self.url, timeout=30)  # explicit timeout so the request cannot hang indefinitely
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Classify the product
        category = ProductClassifier.classify_product(soup)
        
        # Initialize the data dictionary with the category
        data = {'category': category}

        # Extract data according to the category
        if category == 'Tecnología':
            data.update(self.extract_technology_info(soup))
        elif category == 'Moda':
            data.update(self.extract_fashion_info(soup))
        elif category == 'Hogar y Cocina':
            data.update(self.extract_home_and_kitchen_info(soup))
        elif category == 'Libros':
            data.update(self.extract_books_info(soup))
        elif category == 'Alimentación y Bebidas':
            data.update(self.extract_food_and_drink_info(soup))
        elif category == 'Deportes y Aire Libre':
            data.update(self.extract_sports_and_outdoors_info(soup))
        elif category == 'Juguetes y Juegos':
            data.update(self.extract_toys_and_games_info(soup))
        else:
            data.update(self.extract_generic_info(soup))
        
        # Save the collected data to a JSON file
        with open("scraped_data.json", 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
        print("Datos guardados en scraped_data.json")

    # Category-specific extraction methods
    def extract_technology_info(self, soup):
        data = {
            'product_name': self.general_extract(soup, 'span#productTitle'),
            'rating': self.general_extract(soup, 'span.a-size-base.a-color-base'),
            'rating_count': self.general_extract(soup, 'span#acrCustomerReviewText'),
            'marca': self.try_extract_text(soup, 'Marca'),
            'modelo': self.try_extract_text(soup, 'Nombre del modelo'),
            'software': self.try_extract_text(soup, 'Sistema operativo'),
            'tecnologia_celular': self.try_extract_text(soup, 'Tecnología celular'),
            'capacidad': self.try_extract_text(soup, 'Capacidad de almacenamiento de la memoria'),
            'tecnologia_conectividad': self.try_extract_text(soup, 'Tecnología de conectividad'),
            'color': self.try_extract_text(soup, 'Color'),
            'tamano_pantalla': self.try_extract_text(soup, 'Tamaño de la pantalla'),
            'tipo_conector': self.try_extract_text(soup, 'Tipo de conector'),
            'duracion_bateria': self.try_extract_text(soup, 'Duración de la batería'),
            'resolucion_camara': self.try_extract_text(soup, 'Resolución de la cámara'),
            'tipo_procesador': self.try_extract_text(soup, 'Tipo de procesador'),
        }  
        return {'technology_info': data}

    def extract_fashion_info(self, soup):
        data = {
            'product_name': self.general_extract(soup, 'span#productTitle'),
            'rating': self.general_extract(soup, 'span.a-size-base.a-color-base'),
            'rating_count': self.general_extract(soup, 'span#acrCustomerReviewText'),
            'marca': self.try_extract_text(soup, 'Marca'),
            'material': self.try_extract_text(soup, 'Material'),
            'tamanio': self.try_extract_text(soup, 'Tamaño'),
            'deporte': self.try_extract_text(soup, 'Deporte'),
            'rango_edad': self.try_extract_text(soup, 'Rango de edad (descripción)'),
            'tipo_cierre': self.try_extract_text(soup, 'Tipo de cierre'),
            'estilo_cuello': self.try_extract_text(soup, 'Estilo del cuello'),
            'largo_manga': self.try_extract_text(soup, 'Largo de la manga'),
            'estilo_manga': self.try_extract_text(soup, 'Estilo de la manga'),
            'largo_pantalon': self.try_extract_text(soup, 'Largo del pantalón'),
            'tipo_tejido': self.try_extract_text(soup, 'Tipo de tejido'),
        }
        return {'fashion_info': data}

    def extract_home_and_kitchen_info(self, soup):
        data = {
            'product_name': self.general_extract(soup, 'span#productTitle'),
            'rating': self.general_extract(soup, 'span.a-size-base.a-color-base'),
            'rating_count': self.general_extract(soup, 'span#acrCustomerReviewText'),
            'marca': self.try_extract_text(soup, 'Marca'),
            'dimensiones': self.try_extract_text(soup, 'Dimensiones del producto'),
            'peso': self.try_extract_text(soup, 'Peso del producto'),
            'material': self.try_extract_text(soup, 'Material'),
            'capacidad': self.try_extract_text(soup, 'Capacidad'),
            'color': self.try_extract_text(soup, 'Color'),
            'tipo_superficie': self.try_extract_text(soup, 'Tipo de superficie'),
            'voltaje': self.try_extract_text(soup, 'Voltaje'),
            'potencia': self.try_extract_text(soup, 'Potencia'),
            'incluye_bateria': self.try_extract_text(soup, 'Incluye baterías'),
            'requiere_bateria': self.try_extract_text(soup, 'Requiere baterías'),
            'energia_eficiente': self.try_extract_text(soup, 'Eficiencia energética'),
        }
        return {'home_and_kitchen_info': data}

    def extract_books_info(self, soup):
        data = {
            'product_name': self.general_extract(soup, 'span#productTitle'),
            'rating': self.general_extract(soup, 'span.a-size-base.a-color-base'),
            'rating_count': self.general_extract(soup, 'span#acrCustomerReviewText'),
            'autor': self.try_extract_text(soup, 'Autor'),
            'editorial': self.try_extract_text(soup, 'Editorial'),
            'formato': self.try_extract_text(soup, 'Formato'),
            'paginas': self.try_extract_text(soup, 'Número de páginas'),
            'idioma': self.try_extract_text(soup, 'Idioma'),
            'genero': self.try_extract_text(soup, 'Género'),
            'dimensiones': self.try_extract_text(soup, 'Dimensiones del producto'),
            'peso': self.try_extract_text(soup, 'Peso del producto'),
            'fecha_publicacion': self.try_extract_text(soup, 'Fecha de publicación'),
        }
        return {'books_info': data}

    def extract_food_and_drink_info(self, soup):
        data = {
            'product_name': self.general_extract(soup, 'span#productTitle'),
            'rating': self.general_extract(soup, 'span.a-size-base.a-color-base'),
            'rating_count': self.general_extract(soup, 'span#acrCustomerReviewText'),
            'marca': self.try_extract_text(soup, 'Marca'),
            'peso_neto': self.try_extract_text(soup, 'Peso neto'),
            'ingredientes': self.try_extract_text(soup, 'Ingredientes'),
            'origen': self.try_extract_text(soup, 'Origen'),
            'unidad_medida': self.try_extract_text(soup, 'Unidad de medida'),
            'fecha_vencimiento': self.try_extract_text(soup, 'Fecha de vencimiento'),
            'instrucciones_alergenos': self.try_extract_text(soup, 'Instrucciones de alergenos'),
            'conservacion': self.try_extract_text(soup, 'Instrucciones de conservación'),
        }
        return {'food_and_drink_info': data}

    def extract_sports_and_outdoors_info(self, soup):
        data = {
            'product_name': self.general_extract(soup, 'span#productTitle'),
            'rating': self.general_extract(soup, 'span.a-size-base.a-color-base'),
            'rating_count': self.general_extract(soup, 'span#acrCustomerReviewText'),
            'marca': self.try_extract_text(soup, 'Marca'),
            'tipo': self.try_extract_text(soup, 'Tipo de deporte'),
            'material': self.try_extract_text(soup, 'Material'),
            'dimensiones': self.try_extract_text(soup, 'Dimensiones del producto'),
            'edad_recomendada': self.try_extract_text(soup, 'Edad recomendada'),
            'color': self.try_extract_text(soup, 'Color'),
            'talla': self.try_extract_text(soup, 'Talla'),
            'genero': self.try_extract_text(soup, 'Género'),
            'estilo': self.try_extract_text(soup, 'Estilo'),
            'temporada': self.try_extract_text(soup, 'Temporada'),
        }
        return {'sports_and_outdoors_info': data}

    def extract_toys_and_games_info(self, soup):
        data = {
            'product_name': self.general_extract(soup, 'span#productTitle'),
            'rating': self.general_extract(soup, 'span.a-size-base.a-color-base'),
            'rating_count': self.general_extract(soup, 'span#acrCustomerReviewText'),
            'marca': self.try_extract_text(soup, 'Marca'),
            'edad_recomendada': self.try_extract_text(soup, 'Edad recomendada'),
            'material': self.try_extract_text(soup, 'Material'),
            'dimensiones': self.try_extract_text(soup, 'Dimensiones del paquete'),
            'genero': self.try_extract_text(soup, 'Género'),
            'numero_modelo': self.try_extract_text(soup, 'Número de modelo'),
            'requiere_baterias': self.try_extract_text(soup, 'Requiere baterías'),
            'incluye_baterias': self.try_extract_text(soup, 'Incluye baterías'),
            'numero_piezas': self.try_extract_text(soup, 'Número de piezas'),
            'instrucciones_cuidado': self.try_extract_text(soup, 'Instrucciones de cuidado'),
        }
        return {'toys_and_games_info': data}

    def extract_generic_info(self, soup):
        data = {
            'product_name': self.general_extract(soup, 'span#productTitle'),
            'rating': self.general_extract(soup, 'span.a-size-base.a-color-base'),
            'rating_count': self.general_extract(soup, 'span#acrCustomerReviewText'),
            'marca': self.try_extract_text(soup, 'Marca'),
            'descripcion': self.try_extract_text(soup, 'Descripción'),
            'dimensiones': self.try_extract_text(soup, 'Dimensiones del producto'),
            'peso': self.try_extract_text(soup, 'Peso del producto'),
            'material': self.try_extract_text(soup, 'Material'),
            'color': self.try_extract_text(soup, 'Color'),
            'otras_caracteristicas': self.try_extract_text(soup, 'Otras características'),
            'detalles_adicionales': self.try_extract_text(soup, 'Detalles adicionales'),
        }
        return {'generic_info': data}

# Using the Scrapper class
url = 'https://www.amazon.com.mx/dp/1101946342/ref=s9_acsd_al_bw_c2_x_0_i?pf_rd_m=AVDBXBAVVSXLQ&pf_rd_s=merchandised-search-3&pf_rd_r=N2WAGPCYN2PNK97YR29N&pf_rd_t=101&pf_rd_p=53338ce2-58c7-4c35-a600-c8e773b2c1c4&pf_rd_i=11714118011'
objeto = Scrapper(url)
objeto.scrape()
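
As a possible follow-up (not part of this change), the if/elif dispatch in scrape() could be collapsed into a lookup table; a minimal sketch of that part of scrape(), assuming the extractor methods keep their current names:

# Inside Scrapper.scrape(), after classifying the product:
extractors = {
    'Tecnología': self.extract_technology_info,
    'Moda': self.extract_fashion_info,
    'Hogar y Cocina': self.extract_home_and_kitchen_info,
    'Libros': self.extract_books_info,
    'Alimentación y Bebidas': self.extract_food_and_drink_info,
    'Deportes y Aire Libre': self.extract_sports_and_outdoors_info,
    'Juguetes y Juegos': self.extract_toys_and_games_info,
}
# Fall back to the generic extractor for unrecognized categories.
data.update(extractors.get(category, self.extract_generic_info)(soup))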

Tests Performed:

  • Conducted manual tests with various product URLs to verify the accuracy of classification and data extraction (an automated sketch follows this list).
  • Compared results with the previous version of the scraper to ensure consistency and improved data quality.
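
The manual checks above could also be captured as automated tests; a minimal pytest sketch against inline HTML fixtures (the module name scrapper and the fixtures are assumptions, not part of this PR):

from bs4 import BeautifulSoup
from scrapper import ProductClassifier, Scrapper  # hypothetical module name

def make_soup(html):
    return BeautifulSoup(html, 'html.parser')

def test_classifies_fashion_by_title_keyword():
    soup = make_soup('<span id="productTitle">Vestido de verano</span>')
    assert ProductClassifier.classify_product(soup) == 'Moda'

def test_unknown_product_falls_back_to_otro():
    soup = make_soup('<span id="productTitle">Producto misterioso</span>')
    assert ProductClassifier.classify_product(soup) == 'Otro'

def test_missing_field_returns_default():
    # try_extract_text should return its default when the label is absent.
    soup = make_soup('<div></div>')
    assert Scrapper('https://example.invalid').try_extract_text(soup, 'Marca') == 'No encontrado'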

Additional Notes:

  • Ensured compliance with Python style conventions and best practices.
  • Verified compatibility with the latest versions of dependencies.
  • Included new files and resources required for the changes.
