diff --git a/web-scraping/author.py b/web-scraping/author.py
index c124fa6..0560385 100644
--- a/web-scraping/author.py
+++ b/web-scraping/author.py
@@ -19,7 +19,7 @@ def get_author_id(self):
 
     def _set_author_id(self):
         author_id = self.db_uploader.select_articles_authors(self.name)
-        print author_id
+        #print author_id
         if author_id is None:
             return self.db_uploader.insert_articles_authors(self.name)
 
diff --git a/web-scraping/db_uploader.py b/web-scraping/db_uploader.py
index ddb4156..efa4bf1 100644
--- a/web-scraping/db_uploader.py
+++ b/web-scraping/db_uploader.py
@@ -19,11 +19,18 @@ def insert_articles_articles(self, content):
 
         return articles_id
 
+    def select_articles_articles_by_url(self, url):
+        with self.conn.cursor() as cur:
+            sql = "SELECT id from articles_articles where url = '{0}'".format(url)
+            cur.execute(sql)
+            res = cur.fetchone()
+            return res
+
     def select_articles_authors(self, name):
         with self.conn.cursor() as cur:
             name_hash = self.name_hash(name)
             sql = "SELECT id FROM articles_authors where name_hash = '{0}'".format(name_hash)
-            print sql
+            #print sql
             cur.execute(sql)
             return cur.fetchone()
 
diff --git a/web-scraping/reuters_html2content.py b/web-scraping/reuters_html2content.py
index 347cd41..1e1a983 100644
--- a/web-scraping/reuters_html2content.py
+++ b/web-scraping/reuters_html2content.py
@@ -15,7 +15,13 @@ def __init__(self, conn):
         self.conn = conn
 
     def parse(self, full_url, soup):
-        author = Author(self.get_author_name(soup), self.conn)
+        author_name = self.get_author_name(soup)
+        if author_name is None:
+            return None
+        author = Author(author_name, self.conn)
+        author_id = author.get_author_id()
+        if author_id is None:
+            return None
         return Content(
             author.get_author_id(),
             self.get_article_text(soup),
@@ -35,7 +41,7 @@ def get_article_text(self, soup):
         try:
             return "\n".join(map(lambda x: x.text, article_text.find_all("p")))
         except AttributeError:
-            print article_text
+            #print article_text
             return article_text
 
     def get_revision_date(self, soup):
@@ -59,5 +65,5 @@ def parse_time(self, tstr):
     html2content = ReutersHtml2Content()
     url = "http://jp.reuters.com/article/idJP2017020301002019?sp=true"
     soup = scraper.get_sorp(url);
-    print html2content.parse(url, soup)
+    #print html2content.parse(url, soup)
 
diff --git a/web-scraping/reuters_jp_columns_app.py b/web-scraping/reuters_jp_columns_app.py
index f88556a..bcbeddf 100644
--- a/web-scraping/reuters_jp_columns_app.py
+++ b/web-scraping/reuters_jp_columns_app.py
@@ -19,5 +19,8 @@
         full_url = ReutersJpColummsScraper.get_full_url(url)
         soup = scraper.get_soup(full_url)
         content = html2content.parse(full_url, soup)
-        writer.write_articles_file(content)
-    page += 1
+        if content:
+            writer.write_articles_file(content)
+        else:
+            print("content registration error")
+    scraper.load_more_content()
diff --git a/web-scraping/reuters_jp_columns_scraper.py b/web-scraping/reuters_jp_columns_scraper.py
index 25aafc6..0baa1c7 100644
--- a/web-scraping/reuters_jp_columns_scraper.py
+++ b/web-scraping/reuters_jp_columns_scraper.py
@@ -16,7 +16,7 @@ def __init__(self, log_path):
 
     def get_target_url(self):
         url = self.BASE_URL.format(self.page)
-        print url
+        #print url
         return url
 
     def get_soup(self, url):
@@ -38,4 +38,4 @@ def get_full_url(cls, article_path):
 ## test
 if __name__ == '__main__':
     scraper = ReutersJpColummsScraper(ReutersJpColummsScraper.LOG_PATH)
-    print scraper.get_url_list()
+    #print scraper.get_url_list()
diff --git a/web-scraping/reuters_news_app.py b/web-scraping/reuters_news_app.py
index edbd913..21d51b8 100644
--- a/web-scraping/reuters_news_app.py
+++ b/web-scraping/reuters_news_app.py
@@ -22,4 +22,4 @@
         content = html2content.parse(full_url, soup)
         #writer.replace_author(content)
         writer.write_articles_file(content)
-    scraper.load_more_content()
+    scraper.load_more_content()
diff --git a/web-scraping/reuters_the_wire_scraper.py b/web-scraping/reuters_the_wire_scraper.py
index 42c4dcc..ba84a3e 100644
--- a/web-scraping/reuters_the_wire_scraper.py
+++ b/web-scraping/reuters_the_wire_scraper.py
@@ -20,7 +20,7 @@ def get_sorp(self):
 
     def get_url_list(self):
         href_list = self.get_sorp().find_all("a", href=self.RE_ARTICLE)
-        url_list = map(lambda x: x.get("href"), href_list)
+        url_list = list(map(lambda x: x.get("href"), href_list))
         if(len(url_list) > 20):
             del url_list[0:19]
         return url_list
@@ -37,5 +37,5 @@ def get_full_url(cls, article_path):
     scraper = ReutersTheWireScraper(ReutersTheWireScraper.LOG_PATH)
     scraper.load_more_content()
     scraper.load_more_content()
-    print scraper.get_url_list()
-    print len(scraper.get_url_list())
+    #print scraper.get_url_list()
+    #print len(scraper.get_url_list())
diff --git a/web-scraping/scraping_lib.py b/web-scraping/scraping_lib.py
index 931a9c6..4d17569 100644
--- a/web-scraping/scraping_lib.py
+++ b/web-scraping/scraping_lib.py
@@ -45,7 +45,7 @@ def get_latest_sorp(self):
         return self.create_soup(markup)
 
     def get_markup_by_driver(self, url):
-        print url
+        #print url
         self.driver.get(url)
         source = self.driver.page_source
         return source.encode("utf-8")
@@ -69,4 +69,4 @@ class ScrapingLibException(BaseException):
 ## test
 if __name__ == '__main__':
     scraper = ScrapingLib()
-    print scraper.get_sorp("http://sumodb.sumogames.de/Results_text.aspx?b=201509&d=9")
+    #print scraper.get_sorp("http://sumodb.sumogames.de/Results_text.aspx?b=201509&d=9")
diff --git a/web-scraping/toyokeizai_app.py b/web-scraping/toyokeizai_app.py
index 6362b4b..b315140 100644
--- a/web-scraping/toyokeizai_app.py
+++ b/web-scraping/toyokeizai_app.py
@@ -24,4 +24,4 @@
         content = html2content.parse(full_url, soup)
         #writer.replace_author(content)
         writer.write_articles_file(content)
-    page += 1
+    scraper.load_more_content() #page += 1
diff --git a/web-scraping/writer.py b/web-scraping/writer.py
index 728d03c..2b7be1c 100644
--- a/web-scraping/writer.py
+++ b/web-scraping/writer.py
@@ -21,6 +21,9 @@ def __init__(self, conn):
         self.uploader = DbUploader(conn)
 
     def write_articles_file(self, content):
+        content_id = self.uploader.select_articles_articles_by_url(content.url)
+        if content_id is not None:
+            return False
         articles_id = self.uploader.insert_articles_articles(content)
         if articles_id is None:
             return False