From db46482c07aed07aeb9aeccd78d5d2b259ab3ffb Mon Sep 17 00:00:00 2001
From: y2squared
Date: Sun, 5 Feb 2017 00:31:53 +0900
Subject: [PATCH 01/11] comment out print()

---
 web-scraping/author.py                     | 2 +-
 web-scraping/db_uploader.py                | 2 +-
 web-scraping/reuters_html2content.py       | 4 ++--
 web-scraping/reuters_jp_columns_scraper.py | 4 ++--
 web-scraping/scraping_lib.py               | 4 ++--
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/web-scraping/author.py b/web-scraping/author.py
index c124fa6..0560385 100644
--- a/web-scraping/author.py
+++ b/web-scraping/author.py
@@ -19,7 +19,7 @@ def get_author_id(self):
 
     def _set_author_id(self):
         author_id = self.db_uploader.select_articles_authors(self.name)
-        print author_id
+        #print author_id
         if author_id is None:
             return self.db_uploader.insert_articles_authors(self.name)
 
diff --git a/web-scraping/db_uploader.py b/web-scraping/db_uploader.py
index ddb4156..3b3f1f2 100644
--- a/web-scraping/db_uploader.py
+++ b/web-scraping/db_uploader.py
@@ -23,7 +23,7 @@ def select_articles_authors(self, name):
         with self.conn.cursor() as cur:
             name_hash = self.name_hash(name)
             sql = "SELECT id FROM articles_authors where name_hash = '{0}'".format(name_hash)
-            print sql
+            #print sql
             cur.execute(sql)
             return cur.fetchone()
 
diff --git a/web-scraping/reuters_html2content.py b/web-scraping/reuters_html2content.py
index 347cd41..7c0db46 100644
--- a/web-scraping/reuters_html2content.py
+++ b/web-scraping/reuters_html2content.py
@@ -35,7 +35,7 @@ def get_article_text(self, soup):
         try:
             return "\n".join(map(lambda x: x.text, article_text.find_all("p")))
         except AttributeError:
-            print article_text
+            #print article_text
             return article_text
 
     def get_revision_date(self, soup):
@@ -59,5 +59,5 @@ def parse_time(self, tstr):
     html2content = ReutersHtml2Content()
     url = "http://jp.reuters.com/article/idJP2017020301002019?sp=true"
     soup = scraper.get_sorp(url);
-    print html2content.parse(url, soup)
+    #print html2content.parse(url, soup)
 
diff --git a/web-scraping/reuters_jp_columns_scraper.py b/web-scraping/reuters_jp_columns_scraper.py
index 25aafc6..0baa1c7 100644
--- a/web-scraping/reuters_jp_columns_scraper.py
+++ b/web-scraping/reuters_jp_columns_scraper.py
@@ -16,7 +16,7 @@ def __init__(self, log_path):
 
     def get_target_url(self):
         url = self.BASE_URL.format(self.page)
-        print url
+        #print url
         return url
 
     def get_soup(self, url):
@@ -38,4 +38,4 @@ def get_full_url(cls, article_path):
 ## test
 if __name__ == '__main__':
     scraper = ReutersJpColummsScraper(ReutersJpColummsScraper.LOG_PATH)
-    print scraper.get_url_list()
+    #print scraper.get_url_list()
diff --git a/web-scraping/scraping_lib.py b/web-scraping/scraping_lib.py
index 931a9c6..4d17569 100644
--- a/web-scraping/scraping_lib.py
+++ b/web-scraping/scraping_lib.py
@@ -45,7 +45,7 @@ def get_latest_sorp(self):
         return self.create_soup(markup)
 
     def get_markup_by_driver(self, url):
-        print url
+        #print url
         self.driver.get(url)
         source = self.driver.page_source
         return source.encode("utf-8")
@@ -69,4 +69,4 @@ class ScrapingLibException(BaseException):
 ## test
 if __name__ == '__main__':
     scraper = ScrapingLib()
-    print scraper.get_sorp("http://sumodb.sumogames.de/Results_text.aspx?b=201509&d=9")
+    #print scraper.get_sorp("http://sumodb.sumogames.de/Results_text.aspx?b=201509&d=9")

From 8d1d044740ffddc068773d065578e5bdb62b99f1 Mon Sep 17 00:00:00 2001
From: y2squared
Date: Sun, 5 Feb 2017 02:55:38 +0900
Subject: [PATCH 02/11] add url-based query

---
 web-scraping/writer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/web-scraping/writer.py b/web-scraping/writer.py
index 728d03c..2b7be1c 100644
--- a/web-scraping/writer.py
+++ b/web-scraping/writer.py
@@ -21,6 +21,9 @@ def __init__(self, conn):
         self.uploader = DbUploader(conn)
 
     def write_articles_file(self, content):
+        content_id = self.uploader.select_articles_articles_by_url(content.url)
+        if content_id is not None:
+            return False
         articles_id = self.uploader.insert_articles_articles(content)
         if articles_id is None:
             return False

From 17b9408f204773eab1cacc8eadb6db5dd2b3308c Mon Sep 17 00:00:00 2001
From: y2squared
Date: Sun, 5 Feb 2017 02:56:18 +0900
Subject: [PATCH 03/11] add url-based query

---
 web-scraping/db_uploader.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/web-scraping/db_uploader.py b/web-scraping/db_uploader.py
index 3b3f1f2..0879032 100644
--- a/web-scraping/db_uploader.py
+++ b/web-scraping/db_uploader.py
@@ -15,10 +15,18 @@ def insert_articles_articles(self, content):
             sql = "INSERT INTO articles_articles(pub_date, url, author_id, title) VALUES (%s, %s, %s, %s)"
             cur.execute(sql, (content.pub_date, content.url, content.author_id, content.title))
             articles_id = cur.lastrowid
+            print ("article id={0}, author id={1} url={2}".format(articles_id, content.author_id, content.url))
             self.conn.commit()
 
             return articles_id
 
+    def select_articles_articles_by_url(self, url):
+        with self.conn.cursor() as cur:
+            sql = "SELECT id from articles_articles where url = '{0}'".format(url)
+            cur.execute(sql)
+            res = cur.fetchone()
+            return res
+
     def select_articles_authors(self, name):
         with self.conn.cursor() as cur:
             name_hash = self.name_hash(name)

From 669502896dc41bf9a90b40ff16c51d3fa8c53db7 Mon Sep 17 00:00:00 2001
From: y2squared
Date: Sun, 5 Feb 2017 02:59:29 +0900
Subject: [PATCH 04/11] check parse result

---
 web-scraping/reuters_jp_columns_app.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/web-scraping/reuters_jp_columns_app.py b/web-scraping/reuters_jp_columns_app.py
index f88556a..23a0737 100644
--- a/web-scraping/reuters_jp_columns_app.py
+++ b/web-scraping/reuters_jp_columns_app.py
@@ -19,5 +19,8 @@
         full_url = ReutersJpColummsScraper.get_full_url(url)
         soup = scraper.get_soup(full_url)
         content = html2content.parse(full_url, soup)
-        writer.write_articles_file(content)
-    page += 1
+        if content:
+            writer.write_articles_file(content)
+        else:
+            print("content registration error")
+    page += 1

From ee31b522cdc9a7f06ca82e6fafb64090ce58f033 Mon Sep 17 00:00:00 2001
From: y2squared
Date: Sun, 5 Feb 2017 03:00:20 +0900
Subject: [PATCH 05/11] remove comment

---
 web-scraping/db_uploader.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/web-scraping/db_uploader.py b/web-scraping/db_uploader.py
index 0879032..efa4bf1 100644
--- a/web-scraping/db_uploader.py
+++ b/web-scraping/db_uploader.py
@@ -15,7 +15,6 @@ def insert_articles_articles(self, content):
             sql = "INSERT INTO articles_articles(pub_date, url, author_id, title) VALUES (%s, %s, %s, %s)"
             cur.execute(sql, (content.pub_date, content.url, content.author_id, content.title))
             articles_id = cur.lastrowid
-            print ("article id={0}, author id={1} url={2}".format(articles_id, content.author_id, content.url))
             self.conn.commit()
 
             return articles_id

From 644c8bb427958101992a825d56bead637ff0ac42 Mon Sep 17 00:00:00 2001
From: y2squared
Date: Sun, 5 Feb 2017 03:13:06 +0900
Subject: [PATCH 06/11] change url_list type of map to list

---
 web-scraping/reuters_the_wire_scraper.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/web-scraping/reuters_the_wire_scraper.py b/web-scraping/reuters_the_wire_scraper.py
index 42c4dcc..ba84a3e 100644
--- a/web-scraping/reuters_the_wire_scraper.py
+++ b/web-scraping/reuters_the_wire_scraper.py
@@ -20,7 +20,7 @@ def get_sorp(self):
 
     def get_url_list(self):
         href_list = self.get_sorp().find_all("a", href=self.RE_ARTICLE)
-        url_list = map(lambda x: x.get("href"), href_list)
+        url_list = list(map(lambda x: x.get("href"), href_list))
         if(len(url_list) > 20):
             del url_list[0:19]
         return url_list
@@ -37,5 +37,5 @@ def get_full_url(cls, article_path):
     scraper = ReutersTheWireScraper(ReutersTheWireScraper.LOG_PATH)
     scraper.load_more_content()
     scraper.load_more_content()
-    print scraper.get_url_list()
-    print len(scraper.get_url_list())
+    #print scraper.get_url_list()
+    #print len(scraper.get_url_list())

From 469ad3af61960739d20fe74d2f8b3538120d0dfc Mon Sep 17 00:00:00 2001
From: y2squared
Date: Sun, 5 Feb 2017 03:18:13 +0900
Subject: [PATCH 07/11] do not add content whose author is not found.

---
 web-scraping/reuters_html2content.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/web-scraping/reuters_html2content.py b/web-scraping/reuters_html2content.py
index 7c0db46..1e1a983 100644
--- a/web-scraping/reuters_html2content.py
+++ b/web-scraping/reuters_html2content.py
@@ -15,7 +15,13 @@ def __init__(self, conn):
         self.conn = conn
 
     def parse(self, full_url, soup):
-        author = Author(self.get_author_name(soup), self.conn)
+        author_name = self.get_author_name(soup)
+        if author_name is None:
+            return None
+        author = Author(author_name, self.conn)
+        author_id = author.get_author_id()
+        if author_id is None:
+            return None
         return Content(
             author.get_author_id(),
             self.get_article_text(soup),

From 0221a0b03b41837d69cefa7dade69fcf13ec4b1c Mon Sep 17 00:00:00 2001
From: y2squared
Date: Sun, 5 Feb 2017 03:30:37 +0900
Subject: [PATCH 08/11] :bug: fix load_more_content

---
 web-scraping/reuters_jp_columns_app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web-scraping/reuters_jp_columns_app.py b/web-scraping/reuters_jp_columns_app.py
index 23a0737..914f104 100644
--- a/web-scraping/reuters_jp_columns_app.py
+++ b/web-scraping/reuters_jp_columns_app.py
@@ -23,4 +23,4 @@
             writer.write_articles_file(content)
         else:
             print("content registration error")
-    page += 1
+        scraper.load_more_content()

From a1aff5c8acf58bd9df080410eaed05f27b721dd1 Mon Sep 17 00:00:00 2001
From: y2squared
Date: Sun, 5 Feb 2017 03:38:22 +0900
Subject: [PATCH 09/11] :bug: fix indent

---
 web-scraping/reuters_jp_columns_app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web-scraping/reuters_jp_columns_app.py b/web-scraping/reuters_jp_columns_app.py
index 914f104..bcbeddf 100644
--- a/web-scraping/reuters_jp_columns_app.py
+++ b/web-scraping/reuters_jp_columns_app.py
@@ -23,4 +23,4 @@
             writer.write_articles_file(content)
         else:
             print("content registration error")
-        scraper.load_more_content()
+    scraper.load_more_content()

From 8c387b416d77ff1b9985beadbfa0f6480e8783f0 Mon Sep 17 00:00:00 2001
From: y2squared
Date: Sun, 5 Feb 2017 03:54:41 +0900
Subject: [PATCH 10/11] :bug: use load_more_content

---
 web-scraping/toyokeizai_app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web-scraping/toyokeizai_app.py b/web-scraping/toyokeizai_app.py
index 6362b4b..b315140 100644
--- a/web-scraping/toyokeizai_app.py
+++ b/web-scraping/toyokeizai_app.py
@@ -24,4 +24,4 @@
         content = html2content.parse(full_url, soup)
         #writer.replace_author(content)
         writer.write_articles_file(content)
-    page += 1
+    scraper.load_more_content() #page += 1

From 672334be112869aaa6311f8af7038a34eef3d699 Mon Sep 17 00:00:00 2001
From: y2squared
Date: Sun, 5 Feb 2017 03:55:40 +0900
Subject: [PATCH 11/11] :bug: fix indent of load_more_content

---
 web-scraping/reuters_news_app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web-scraping/reuters_news_app.py b/web-scraping/reuters_news_app.py
index edbd913..21d51b8 100644
--- a/web-scraping/reuters_news_app.py
+++ b/web-scraping/reuters_news_app.py
@@ -22,4 +22,4 @@
         content = html2content.parse(full_url, soup)
         #writer.replace_author(content)
         writer.write_articles_file(content)
-        scraper.load_more_content()
+    scraper.load_more_content()
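
Note: patches 02 and 03 together give write_articles_file a URL-based duplicate check: look up the article by URL first and skip the insert when a row already exists. The sketch below shows that flow in isolation. It assumes a PyMySQL-style DB-API connection (the project code uses with conn.cursor() and cur.lastrowid) and the articles_articles table from the diffs; the function names are illustrative, not the repository's, and it uses parameterized queries instead of the '{0}'.format(url) string interpolation in the diff.

# Rough standalone sketch of the URL-based duplicate check from patches 02/03.
# Assumes a PyMySQL-style connection object; names here are illustrative only.
def select_article_id_by_url(conn, url):
    # Return the stored article id for this URL, or None if it is not stored yet.
    with conn.cursor() as cur:
        cur.execute("SELECT id FROM articles_articles WHERE url = %s", (url,))
        row = cur.fetchone()
        return row[0] if row else None

def write_article(conn, pub_date, url, author_id, title):
    # Skip the insert when the URL has already been scraped (patch 02's early return).
    if select_article_id_by_url(conn, url) is not None:
        return False
    with conn.cursor() as cur:
        cur.execute(
            "INSERT INTO articles_articles(pub_date, url, author_id, title) "
            "VALUES (%s, %s, %s, %s)",
            (pub_date, url, author_id, title))
        article_id = cur.lastrowid
    conn.commit()
    return article_id

Passing the URL as a query parameter rather than formatting it into the SQL string also sidesteps quoting problems with URLs that contain apostrophes or percent signs.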