
Commit d82723c

Rework web scraping example.
1 parent 12bf6ec commit d82723c

File tree

1 file changed (+83 -76 lines changed)

notebooks/2b_retrieval1.ipynb

Lines changed: 83 additions & 76 deletions
@@ -47,7 +47,7 @@
 "1. Extract the data from the pages.\n",
 "1. Clean and save the resulting data.\n",
 "\n",
-"Let's walk through an example of getting press releases from the [Microsoft website](https://news.microsoft.com/category/press-releases/).\n",
+"Let's walk through an example of getting press releases from the [Alphabet website](https://abc.xyz/investor/news/2024/).\n",
 "\n",
 "I often prefer to work out of order as follows:\n",
 "\n",
@@ -91,14 +91,14 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"_AGENT = \"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0\"\n",
-"\n",
-"pr_url_1 = (\n",
-" \"https://news.microsoft.com/2018/10/04/\"\n",
-" \"redline-communications-and-microsoft-announce-\"\n",
-" \"partnership-to-lower-the-cost-of-tv-white-space-solutions/\"\n",
+"AGENT = (\n",
+" \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\"\n",
+" \" (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.3\"\n",
 ")\n",
-"pr_req_1 = requests.get(pr_url_1, headers={\"User-Agent\": _AGENT})"
+"\n",
+"pr_url_1 = \"https://abc.xyz/2024-1010/\"\n",
+"\n",
+"pr_req_1 = requests.get(pr_url_1, headers={\"User-Agent\": AGENT})"
 ]
 },
 {
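
The rewritten cell fetches a single press release with a browser-like `User-Agent` header, which some sites require before serving full pages. A standalone sketch of that request (the URL and header string are copied from the hunk above; the `timeout` and status check are added here purely for illustration):

```python
import requests

# Browser-like User-Agent string, as set in the reworked notebook cell.
AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    " (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.3"
)

pr_url_1 = "https://abc.xyz/2024-1010/"

# Send the header along with the request; some servers block the default client string.
pr_req_1 = requests.get(pr_url_1, headers={"User-Agent": AGENT}, timeout=30)
print(pr_req_1.status_code)  # 200 on success
```
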
@@ -111,6 +111,54 @@
 "pr_req_1.status_code"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"### Encoding\n",
+"\n",
+"This is a very deep topic that we only need to barely touch.\n",
+"In short, there are many standards for representing text as mappings of bytes (eight 0 or 1 values).\n",
+"Many of them have significant overlap (based on underlying standards that they are a superset of), such that they at least mostly work, but it's better if we're sure we're using the right encoding.\n",
+"\n",
+"In our example here, the server sends data in such a way that we would infer that the text is in the `ISO-8859-1` encoding, though it is actually in the `UTF-8` encoding.\n",
+"Fortunately, `requests` can tell us both what the encoding is and what it thinks it actually is, so we can build upon that."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"pr_req_1.encoding"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"pr_req_1.apparent_encoding"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"pr_req_1.encoding = pr_req_1.apparent_encoding"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"### Extracting content"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -208,36 +256,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"pr_soup_1.find(\"div\", {\"class\": \"entry-content m-blog-content\"}).find(\"h3\").text"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"pr_data_1[\"h3\"] = (\n",
-" pr_soup_1.find(\"div\", {\"class\": \"entry-content m-blog-content\"}).find(\"h3\").text\n",
-")"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"pr_data_1"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"pr_soup_1.find(\"div\", {\"class\": \"entry-content m-blog-content\"}).find_all(\"p\")"
+"pr_soup_1.find(\"div\", {\"class\": \"RichTextArticleBody RichTextBody\"}).find_all(\"p\")"
 ]
 },
 {
@@ -251,7 +270,7 @@
 " [\n",
 " i.text\n",
 " for i in pr_soup_1.find(\n",
-" \"div\", {\"class\": \"entry-content m-blog-content\"}\n",
+" \"div\", {\"class\": \"RichTextArticleBody RichTextBody\"}\n",
 " ).find_all(\"p\")\n",
 " ]\n",
 ")"
@@ -288,26 +307,19 @@
 "def get_data_from_soup(soup):\n",
 " data = {}\n",
 " for meta in _METAS:\n",
-" if soup.find(\"meta\", property=meta) is not None:\n",
+" try:\n",
 " prop = soup.find(\"meta\", property=meta)[\"property\"]\n",
-" if soup.find(\"meta\", property=meta) is not None:\n",
 " content = soup.find(\"meta\", property=meta)[\"content\"]\n",
-" if prop is not None and content is not None:\n",
-" data.update({prop: content})\n",
-" try:\n",
-" data[\"h3\"] = (\n",
-" soup.find(\"div\", {\"class\": \"entry-content m-blog-content\"})\n",
-" .find(\"h3\")\n",
-" .string\n",
-" )\n",
-" except AttributeError:\n",
-" data[\"h3\"] = \"\"\n",
+" except TypeError:\n",
+" prop = meta\n",
+" content = \"\"\n",
+" data.update({prop: content})\n",
 "\n",
 " data[\"body\"] = \"\\n\\n\".join(\n",
 " [\n",
 " i.text\n",
 " for i in soup.find(\n",
-" \"div\", {\"class\": \"entry-content m-blog-content\"}\n",
+" \"div\", {\"class\": \"RichTextArticleBody RichTextBody\"}\n",
 " ).find_all(\"p\")\n",
 " ]\n",
 " )\n",
@@ -340,8 +352,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"many_pr_url_1 = \"https://news.microsoft.com/category/press-releases/\"\n",
-"many_pr_page_1 = requests.get(many_pr_url_1, headers={\"User-Agent\": _AGENT}).text\n",
+"many_pr_url_1 = \"https://abc.xyz/investor/news/2024/\"\n",
+"many_pr_page_1 = requests.get(many_pr_url_1, headers={\"User-Agent\": AGENT}).text\n",
 "many_pr_soup_1 = BeautifulSoup(many_pr_page_1)"
 ]
 },
@@ -351,8 +363,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# Almost, but note the ones at the bottom.\n",
-"many_pr_soup_1.find(\"section\", id=\"primary\").find_all(\"a\")"
+"# Here, we find the div containing the listings and then find the links within.\n",
+"many_pr_soup_1.find(\"div\", {\"class\": \"PageListW-items\"}).find_all(\"a\")"
 ]
 },
 {
@@ -361,10 +373,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# Here, we further filter down to articles and then get their hrefs to\n",
-"# eliminate the navigation links at the bottom.\n",
-"articles = many_pr_soup_1.find(\"section\", id=\"primary\").find_all(\"article\")\n",
-"links = [i.find(\"a\")[\"href\"] for i in articles]\n",
+"# Then, for each of the anchor tags, we can extract the links themselves.\n",
+"articles = many_pr_soup_1.find(\"div\", {\"class\": \"PageListW-items\"}).find_all(\"a\")\n",
+"links = [i[\"href\"] for i in articles]\n",
 "links"
 ]
 },
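
The listing-page change follows the same shape: locate the container div by class, collect its anchor tags, and read each `href`. A toy sketch (the class name and the first URL come from the diff; the rest of the HTML is invented for illustration):

```python
from bs4 import BeautifulSoup

# Invented listing HTML; the real page uses the same container class.
html = """
<div class="PageListW-items">
  <a href="https://abc.xyz/2024-1010/">Release one</a>
  <a href="https://abc.xyz/another-release/">Release two</a>
</div>
"""

soup = BeautifulSoup(html, "html.parser")

# Grab every anchor inside the listing container, then read its href attribute.
articles = soup.find("div", {"class": "PageListW-items"}).find_all("a")
links = [a["href"] for a in articles]
print(links)
```
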
@@ -392,15 +403,17 @@
 "source": [
 "# We need to turn links into soup objects a lot, so let's make a function.\n",
 "def link_to_soup(link):\n",
-" page = requests.get(link, headers={\"User-Agent\": _AGENT}).text\n",
+" page_request = requests.get(link, headers={\"User-Agent\": AGENT})\n",
+" page_request.encoding = page_request.apparent_encoding\n",
+" page = page_request.text\n",
 " soup = BeautifulSoup(page)\n",
 " return soup\n",
 "\n",
 "\n",
 "def get_links_from_link_page(link_page):\n",
 " soup = link_to_soup(link_page)\n",
-" articles = soup.find(\"section\", id=\"primary\").find_all(\"article\")\n",
-" links = [i.find(\"a\")[\"href\"] for i in articles]\n",
+" articles = soup.find(\"div\", {\"class\": \"PageListW-items\"})\n",
+" links = [i[\"href\"] for i in articles]\n",
 " return links\n",
 "\n",
 "\n",
@@ -419,8 +432,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"msft_prs = pd.DataFrame(get_data_from_links(many_pr_links_1))\n",
-"msft_prs.head()"
+"alphabet_prs = pd.DataFrame(get_data_from_links(many_pr_links_1))\n",
+"alphabet_prs.head()"
 ]
 },
 {
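
The renamed cell simply wraps the list of per-release dicts in a DataFrame. For reference, a toy version of that last step (the rows are invented placeholders standing in for `get_data_from_links(...)` output):

```python
import pandas as pd

# Invented rows standing in for the scraped per-release dicts.
rows = [
    {"og:title": "Release one", "body": "First paragraph..."},
    {"og:title": "Release two", "body": "Second paragraph..."},
]

alphabet_prs = pd.DataFrame(rows)
print(alphabet_prs.head())
```
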
@@ -429,18 +442,12 @@
 "source": [
 "# Further automation\n",
 "\n",
-"**Note**: for running time reasons, we're not going to make a multi-links-page version, but note that there's a next page link at the bottom of those pages that can be extracted to build that:\n",
-"\n",
-"```html\n",
-"<a href=\"/category/press-releases/page/2/?paged=3\" \n",
-" class=\"c-glyph x-hidden-focus\" \n",
-" aria-label=\"Go to next page\" ms.title=\"Next Page\">\n",
-"```\n",
+"**Note**: for running time reasons, we're not going to make a multi-links-page version, but note that there are year links on the left of the listing pages that can be extracted.\n",
 "\n",
-"However, we could also notice that the link pages have a number in the URL that is incremented by one for each page.\n",
-"We would have to look at a page to get the end number, but we could also simply use a loop to construct a URL for each of those numbers.\n",
+"However, we could also notice that the link pages have a year in the URL.\n",
+"We would have to look at a page to get the earliest year, but we could otherwise simply use a loop to construct a URL for each of those years.\n",
 "\n",
-"`https://news.microsoft.com/category/press-releases/page/2/`"
+"`https://abc.xyz/investor/news/2023/`"
 ]
 }
 ],
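
The closing note suggests looping over years to build the listing URLs. A minimal sketch of that idea (the year range here is an assumption; in practice you would check the site for the earliest available year):

```python
# Build one listing URL per year; the starting year is assumed, not checked.
year_urls = [
    f"https://abc.xyz/investor/news/{year}/"
    for year in range(2020, 2025)
]
print(year_urls)
```
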
@@ -460,7 +467,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.4"
+"version": "3.11.11"
 },
 "vscode": {
 "interpreter": {

0 commit comments