-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathScraper.py
More file actions
274 lines (213 loc) · 13.7 KB
/
Scraper.py
File metadata and controls
274 lines (213 loc) · 13.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
import requests
import json
from classes.Opening import *
from re import sub
from bs4 import BeautifulSoup
from os import mkdir, path, listdir
from nltk.corpus import stopwords
from shutil import copyfileobj
''' class Scraper
DESCRIPTION:
The Scraper class performs all operations for scraping data from various sources. The methods
are separated to provide some flexibility on operations.
ATTRIBUTES:
index (dict[str,dict[str,int]]) - the index created when scrape_descriptions() is called
- the index saves in the file at Paths.INDEX_JSON
- the form of the index is [key: val] == [term: {opening_code:term_freq}]
- example:
{
'center': {
'A00': 4,
'B32': 15,
...
},
...
}
openings_dict (OpeningsDict) - an OpeningsDict object to store the openings associated with this Scraper
- can be created using any of the generative methods in OpeningsDict, i.e. from_json(), from_list(), etc.
num_terms (dict[str,int]) - keeps track of the total number of terms for each opening (denoted by the code)
- combines the number of terms from ALL sources of descriptions
- saves in Paths.NUM_TERMS_JSON
- in the form [key: val] == [opening_code: num_terms]
- example:
{
'A00': 456,
'B32': 942,
...
}
NOTE: see each method description for in-depth details on how each method is used and the parameters
associated with each function.
'''
class Scraper:
    """Performs all scraping operations for chess-opening data.

    Attributes:
        index (dict[str,dict[str,int]]): inverted index built by
            scrape_descriptions(), of the form {term: {opening_code: term_freq}}.
            Persisted at Paths.INDEX_JSON.
        openings_dict (OpeningsDict): the openings associated with this Scraper;
            can be built via any OpeningsDict factory (from_json(), from_list(), ...).
        num_terms (dict[str,int]): total term count per opening code, summed
            across ALL description sources. Persisted at Paths.NUM_TERMS_JSON.
    """
    # STATIC ATTRIBUTES
    # Browser-like User-Agent so the scraped sites do not reject the requests.
    headers:dict[str,str] = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    chess_com_url:str = "https://www.chess.com/callback/eco/advanced-search?keyword=&useFavorites=false&page="
    stopwords = stopwords.words('english')
    # The divs containing the content of interest for each site we scrape.
    # NOTE: the values are tuples (attribute_type, attribute_name) where
    # attribute_type is 0 (class) or 1 (id).
    # Annotation fixed: the values are tuples, not strings.
    divs:dict[str,tuple[int,str]] = {
        'chess.com':(0,'post-view-content'),
        'wikipedia':(1,'mw-content-text')
    }

    # DYNAMIC ATTRIBUTES
    index:dict[str,dict[str,int]]
    openings_dict:OpeningsDict
    num_terms:dict[str,int]

    def __init__(self, auto_load_data:bool=False):
        """Create a Scraper, optionally loading previously saved data.

        PARAMETERS:
            auto_load_data (bool) - (optional) when True, load the index,
                openings dict, and num_terms from their JSON files; otherwise
                start with empty structures.
        """
        if auto_load_data:
            # Use context managers so the JSON file handles are closed
            # (the original bare open() calls leaked them).
            with open(Paths.INDEX_JSON) as index_file:
                self.index = json.load(index_file)
            self.openings_dict = OpeningsDict.from_json()
            with open(Paths.NUM_TERMS_JSON) as num_terms_file:
                self.num_terms = json.load(num_terms_file)
        else:
            self.index = {}
            self.openings_dict = OpeningsDict()
            self.num_terms = {}

    def get_openings(self, print_debug:bool=True, auto_save:bool=True) -> OpeningsDict:
        """Get opening names and links from chess.com and build wikipedia links.

        Scrapes the openings (including variations) off chess.com's paginated
        opening search endpoint and derives wikipedia links from the names.
        Does NOT pull descriptions and does NOT separate variations from
        generic openings.

        PARAMETERS:
            print_debug (bool) - (optional) print a summary when done
            auto_save (bool) - (optional) persist the result to Paths.OPENINGS_JSON
        RETURNS:
            The populated OpeningsDict (also stored in self.openings_dict).
        """
        all_openings:list[Opening] = []  # Accumulates every opening found
        # Chess.com's opening search has 5 pages (numbered 1-5, inclusive);
        # collect each page in turn and append to all_openings as we go.
        for page_num in range(1, 6):
            url = Scraper.chess_com_url + str(page_num)          # Endpoint for this page
            response = requests.get(url, headers=Scraper.headers)
            openings = response.json()['items']                  # 'items' holds the openings
            # Convert the raw JSON objects into Opening instances
            all_openings.extend([
                Opening(
                    o['name'],                                   # Name
                    o['code'],                                   # Code
                    "",                                          # Color
                    Opening.wiki_link_from_name(o['name']),      # Wiki link
                    o['url'],                                    # Chess.com URL
                    o['move_list']                               # Move list (as str)
                )
                for o in openings]
            )
        # Create an openings dict obj to return and save if configured
        self.openings_dict = OpeningsDict.from_list(all_openings)
        # Persistent save to json
        if auto_save: self.openings_dict.dump_json()
        # Print info about results
        if print_debug: print(f"Scraper: done getting openings.\nSaved {len(all_openings)} results to \"{Paths.OPENINGS_JSON}.\"")
        return self.openings_dict

    def scrape_descriptions(self, auto_save_index:bool=True, print_debug:bool=True) -> None:
        """Scrape the openings' descriptions from Wikipedia and Chess.com.

        Uses self.openings_dict for the links; builds self.index (term ->
        {opening_code: term_freq}) and self.num_terms (opening_code -> count),
        and saves the raw page text under Paths.RAW_DESC_BASE/<code>/<site>.txt.

        PARAMETERS:
            auto_save_index (bool) - (optional) persist the index to
                Paths.INDEX_JSON and num_terms to Paths.NUM_TERMS_JSON when
                done. NOTE: overwrites whatever currently exists in those files.
            print_debug (bool) - (optional) print per-opening progress/errors
        """
        num_done:int = 0
        num_openings:int = len(self.openings_dict.openings)
        if print_debug: print(f"Scraping {num_openings} openings...")
        # Iterate over self.openings_dict.openings and visit all the URLs
        for c,v in self.openings_dict.openings.items():
            num_terms:int = 0  # Total terms for this opening across all descriptions
            # Create dir to save raw content if needed
            if not path.exists(Paths.RAW_DESC_BASE + c): mkdir(Paths.RAW_DESC_BASE + c)
            # Visit every link this opening has and collect the content
            for site,url in v['links'].items():
                if not url: continue  # Some openings lack a chess.com or wikipedia URL
                response = requests.get(url, headers=Scraper.headers)
                # Skip if there is an error code
                if response.status_code != 200:
                    if print_debug: print(f"Error getting content for \n\tCode: {c}\n\tName: {v['opening-name']}\n\tURL: {url}")
                    continue
                # Use BeautifulSoup to parse the HTML
                soup:BeautifulSoup = BeautifulSoup(response.text, 'html.parser')
                # Save the raw page text. FIX: the original opened this file,
                # never wrote to it (the write logic was commented out), and
                # never closed it -- leaking a handle per site per opening and
                # leaving empty files for __concat_all_descriptions__(). The
                # context manager guarantees the handle is closed.
                try:
                    with open(Paths.RAW_DESC_BASE + c + f"/{site}.txt", "w+", encoding="utf-8") as raw_file:
                        raw_file.write(soup.text)
                except OSError as e:
                    if print_debug:
                        print(f"Error writing to file for description of {c} \n\tNAME: {v['opening-name']} \n\tSITE: {site}")
                        print(f"\tERROR: {e}")
                # Tokenize the content and fold the tokens into the index
                tokens:list[str] = Scraper.tokenize(soup.text)
                for t in tokens:
                    # setdefault/get replaces the original try/except KeyError
                    # dance with identical semantics: create the posting dict
                    # on first sight, then bump this code's term frequency.
                    postings = self.index.setdefault(t, {})
                    postings[c] = postings.get(c, 0) + 1
                num_terms += len(tokens)
            self.num_terms[c] = num_terms  # Final term total for this opening
            num_done += 1
            if print_debug: print(f"Done scraping for \"{c}\", \"{v['opening-name']}\" ({num_done}/{num_openings})")
        # If configured, save the index/num_terms, overwriting whatever exists
        if auto_save_index:
            with open(Paths.INDEX_JSON, 'w') as index_file:
                json.dump(self.index, index_file, indent=4)
            with open(Paths.NUM_TERMS_JSON, 'w') as num_terms_file:
                json.dump(self.num_terms, num_terms_file, indent=4)
            self.openings_dict.dump_json()
        # Generate the concatenated files for all ECO descriptions
        #self.__concat_all_descriptions__()
        if print_debug: print(f"Scraper: Done scraping descriptions.\nNew index length = {len(self.index)}")

    def __concat_all_descriptions__(self) -> None:
        """Concatenate each opening's per-site description files into a single
        concat.txt inside that opening's directory under Paths.RAW_DESC_BASE.

        FIX: builds the output path the same way as the rest of the class
        (no stray extra '/'), and skips concat.txt itself so a rerun does not
        fold a previous concatenation back into the new one.
        """
        for code in listdir(Paths.RAW_DESC_BASE):
            code_dir = Paths.RAW_DESC_BASE + code + "/"
            with open(code_dir + 'concat.txt', 'wb') as output:
                for component in listdir(Paths.RAW_DESC_BASE + code):
                    if component == 'concat.txt':
                        continue  # Don't re-ingest the file we are writing
                    with open(code_dir + component, 'rb') as component_to_read:
                        copyfileobj(component_to_read, output)

    @staticmethod
    def tokenize(text:str) -> list[str]:
        """Tokenize the given text in a standard way.

        PARAMETERS:
            text (str) - the text to tokenize
        RETURNS:
            A list of strings representing the tokenized text, lowercased,
            stripped, and with English stopwords removed.
        NOTE: treats spaces, special characters, and newlines as delimiters.
        """
        # Replace any special chars in the content with spaces to act as delimiters
        pattern:str = r'[^a-zA-Z0-9\s]'      # Plaintext characters only (a-z, A-Z, 0-9)
        text = sub(pattern, ' ', text)       # Substitute all matches with spaces
        text = sub(r'html\r\n', '', text)    # Remove the html head
        text = sub(r'\n', ' ', text)         # Remove newlines
        # Split on spaces, lowercase, strip, and drop empties/stopwords
        return [s.lower().strip() for s in text.split(' ')
                if not s.isspace() and s and not s.lower() in Scraper.stopwords]