forked from Tundrasama/tax_protest
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprogram.py
More file actions
378 lines (326 loc) · 14.1 KB
/
program.py
File metadata and controls
378 lines (326 loc) · 14.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
"""
Midland Central Appraisal District Scraper v1.1
Operation:
• Prompted to enter a a single property or use the street/neighborhood lookup
Output:
Example:
___________
R000050656,2017,$218100 |
R000050656,2016,$203360 |
R000050656,2015,$203360 |--------- Change in valuation over last four years
R000050656,2014,$196080 | --- Property ID, Valuation Year, Valuation Amount
___________|
_____
R000050656,RESIDENCE,1962,2232 |
R000050656,GARAGE,1962,504 |
R000050656,STG BLDG,2012,160 |--- Types of improvements
R000050656,GLASS PORCH,1997,490 | --- date associated with RESIDENCE is the construction date
_____| --- Property ID, Improvement Type, Improvement Year, Improvement Sqft
R000050656,$3,713.82 ------------------ Total Assessed Tax for current year
--- Property ID, Tax Assessed
Analysis:
Use your preferred tool
(for) Version 1.2:
• Single Lookup doesn't pull raw data -- need to use part of the fetch_properties code
Version 1.1:
• Street Lookup - use 's:<street name>' to look up all the properties on that street
• Neighborhood Lookup - use 'n:<neighborhood name>' to look up all the properties in that neighborhood
• summary data will now be written to summary.txt
['property_id', 'geographic_id', 'owner_name', 'address', 'legal', 'market_value']
Version 1.0:
• Single property (by property code) or loop (from root csv) lookup
Limits:
• Most records that can be pulled at once appears to be 1024
• trying to bypass this and get every record in Midland
"""
import re
import requests
import urllib
from bs4 import BeautifulSoup
import csv
import numpy as np
valuation_headers = ['property_id', 'year', 'valuation']
improvements_headers = ['property_id', 'improvement', 'year', 'sqft']
taxes_headers = ['property_id', 'tax']
summary_headers = ['property_id', 'geographic_id', 'owner_name', 'address', 'legal', 'market_value']
base_url = "http://iswdataclient.azurewebsites.net/webProperty.aspx?dbkey=midlandcad&id="
def choice():
create_files()
property_id = input(
"Enter a Property ID, street name (s:<name>), or neighborhood name (n:<name>):")
if property_id != "":
if len(property_id.strip()) == 10 and property_id[0].strip().capitalize() == "R":
# print(neighborhood)
property_id = property_id.strip().capitalize()
print("Looking up property: {}".format(property_id))
try:
single_lookup(property_id)
except:
write_error(property_id, 'lookup error')
choice()
elif 'n:' in property_id.strip() or 's:' in property_id.strip():
fetch_properties(property_id)
elif property_id == 'all':
fetch_every_property()
else:
print("Unrecognized command")
quit()
def fetch_every_property():
# trying to pull every record in Midland, TX
base_id = "R"
for x in range(1, 250000):
property_id = x
property_id = format(property_id, "09d")
property_id = "{}{}".format(base_id, property_id)
single_lookup(property_id)
def fetch_properties(loc_type):
if loc_type[0] == 's':
stype = 'situs'
location = loc_type.replace("s:", "").replace(" ", "%20")
search_url = "http://iswdataclient.azurewebsites.net/webSearchAddress.aspx?dbkey=midlandcad&stype={}&sdata=".format(
stype)
search_url = "{}{}%7c0%7c".format(search_url, location) # addition necessary when looking up street
elif loc_type[0] == 'n':
stype = 'legal'
location = loc_type.replace("n:", "").replace(" ", "%20")
search_url = "http://iswdataclient.azurewebsites.net/webSearchAddress.aspx?dbkey=midlandcad&stype={}&sdata={}".format(
stype, location)
form_data = {
'ucSearchAddress_searchstreet': location,
}
data = urllib.parse.urlencode(form_data).encode("utf-8")
req = urllib.request.Request(search_url)
html_txt = urllib.request.urlopen(req, data=data)
bs = BeautifulSoup(html_txt, "lxml")
raw_data = create_raw_data(bs)
property_ids = []
property_data = []
property_summary_data = [[]]
for i, properties in enumerate(raw_data):
if len(properties) != 0:
if properties[0] == 'View Property':
# print(i, properties)
property_ids.append(properties[1])
for x in range(1, len(properties) - 1):
property_data.append(properties[x])
property_summary_data.append(property_data)
with open("summary.txt", 'a', newline='') as outfile:
outfile.write("{},{},{},{},{},{}\n".format(property_data[0],
property_data[1],
property_data[2],
property_data[3],
property_data[4],
property_data[5]))
property_data = []
# print(property_summary_data)
# print(len(property_ids))
if len(property_ids) == 0:
print("No properties found for '{}'.".format(location))
else:
print("Fetching information for properties...")
get_metrics(property_ids, property_summary_data)
# print("Properties: {}".format(property_ids))
def create_raw_data(soup):
trs = soup.findAll('tr')
property_info = [[]]
indv_prop = []
for i, tr in enumerate(trs):
if "R000" in tr.text:
for td in tr.findAll('td'):
if td.text != "":
indv_prop.append(td.text)
property_info.append(indv_prop)
indv_prop = []
return property_info
def single_lookup(property_id):
valuation = []
improvements = []
taxes = []
str_url = "{}{}".format(base_url, property_id)
html_txt = urllib.request.urlopen(str_url)
bs = BeautifulSoup(html_txt, "lxml")
try:
valuation = get_valuation(bs)
except:
write_error(property_id, 'valuation error')
try:
improvements = get_improvements(bs)
except:
write_error(property_id, 'improvements error')
try:
taxes = get_taxes(bs)
except:
write_error(property_id, 'taxes error')
try:
address = get_address(bs)
except:
write_error(property_id, 'address error')
entries = assemble_entries(valuation, improvements, taxes, property_id)
def get_metrics(properties, data):
valuation = []
improvements = []
taxes = []
for property_id in properties:
url = "{}{}".format(base_url, property_id)
html_txt = urllib.request.urlopen(url)
bs = BeautifulSoup(html_txt, "lxml")
for sublist in data:
if property_id in sublist:
print("------------------------------")
print("Address: {}".format(sublist[3]))
try:
valuation = get_valuation(bs)
except:
write_error(property_id, 'valuation error')
# get improvements information
try:
improvements = get_improvements(bs)
except:
write_error(property_id, 'improvements error')
# get taxes information
try:
taxes = get_taxes(bs)
except:
write_error(property_id, 'taxes error')
# Format for output
entries = assemble_entries(valuation, improvements, taxes, property_id)
# TODO: still used with single lookup, but need to incorporate address lookup from fetch_properties to single_lookup
def get_address(soup):
address = soup.find(lambda tag: tag.has_attr('id') and tag['id'] == "webprop_situs").get_text()
# print(address)
def get_valuation(soup):
# pulls html for the selected criteria, summary='Valuation Table'
valuation = soup.find(lambda tag: tag.has_attr('summary') and tag['summary'] == "Valuation Table").get_text()
# split the result into lines
valuation_lines = valuation.splitlines()
# header information is fixed
header = valuation_lines[5:9]
# assessed value is fixed
assessed = valuation_lines[97:101]
# use numpy to concatenate the two arrays vertically
valuation = np.vstack((header, assessed))
return valuation
def get_improvements(soup):
# setup lists (arrays)
# there are probably better ways to do this portion, but still learning BeautifulSoup
arr_improvements = []
arr_improvements_year = []
arr_improvements_sqft = []
# find the table with the information regarding improvements
improvements = soup.find(lambda tag: tag.has_attr('summary') and tag['summary'] == "Building Details")
# get the table rows
improvement_data = improvements.findAll("tr")
# if you want to loop and include an index without having to increment, you can use enumerate()
for i, item in enumerate(improvement_data):
# find the table cells -- always in the same format
for j, td in enumerate(item.findAll("td")):
# improvement type - text=True strips out all of the html elements
if j == 2:
arr_improvements.append(td.findAll(text=True))
# improvement year
elif j == 3:
arr_improvements_year.append(td.findAll(text=True))
# improvement sqft
elif j == 4:
arr_improvements_sqft.append(td.findAll(text=True))
# this step isn't really necessary, could probably be removed
improvements = arr_improvements
years = arr_improvements_year
sqft = arr_improvements_sqft
# use numpy to concatenate the 3 lists horizontally
improvements = np.hstack((improvements, years, sqft))
return improvements
def get_taxes(soup):
# setup tax list
tax_items = []
# find the associated table
taxes = soup.find(lambda tag: tag.has_attr('summary') and tag['summary'] == "Estimated Taxes")
# only need the summary row (which is always bold)
for strong_tag in taxes.findAll("strong"):
tax_items.append(strong_tag.text)
# just get the last value, which is the total assessed tax
return (tax_items[-1])
def assemble_entries(val, imprv, tax, property_id):
# the values were still being treated as numpy arrays, so I couldn't remove the commas in the
# variables that were currency, but creating a new variable allowed me to replace the commas,
# tried to create the variables dynamically in a loop, but it wasn't working properly
val1 = val[1][0]
val2 = val[1][1]
val3 = val[1][2]
val4 = val[1][3]
"""
for x in range(0, 4):
y = x
y += 1
exec("val{} = val[1,{}]".format(y, x))
"""
"""
there are always four lines, one for each year
should look like: R000050406,2017,$123930
R000050406,2016,$118910
R000050406,2015,$114220
R000050406,2014,$103840
"""
valuation_lines = "\n{},{},{}\n{},{},{}\n{},{},{}\n{},{},{}".format(property_id, val[0][0], val1.replace(",", ""),
property_id, val[0][1], val2.replace(",", ""),
property_id, val[0][2], val3.replace(",", ""),
property_id, val[0][3], val4.replace(",", ""))
# print(valuation_lines)
# write to file
with open("valuation.txt", 'a', newline='') as outfile:
outfile.write(valuation_lines)
# the number of improvements can vary, but .shape can tell me the dimensions of the array
# and allow me to iterate over it accordingly
"""
should look like:R000050407,RESIDENCE,1959,1612
R000050407,GARAGE,1959,483
R000050407,ADDITION,1959,360
"""
improvement_lines = []
x = 0
try:
while x < imprv.shape[0]:
sqft = imprv[x][2]
sqft = sqft.replace(",", "")
# presentation view
# improvement_lines.append("{} ({}) - {}sqft".format(imprv[x][0].capitalize(), imprv[x][1], sqft))
# csv view
improvement_lines.append("{},{},{},{}".format(property_id, imprv[x][0], imprv[x][1], sqft))
x += 1
improvement_lines = "\n".join(improvement_lines)
# print(improvement_lines)
# write to file
with open("improvements.txt", 'a', newline='') as outfile:
outfile.write('\n{}.'.format(improvement_lines))
except:
write_error(property_id, 'assembling improvements error')
# not much extra formatting required for taxes
"""
should look like: R000050407,$3,192.19
"""
# print("{},{}".format(property_id, tax))
with open("taxes.txt", 'a', newline='') as outfile:
outfile.write("\n{},{}".format(property_id, tax))
def create_files():
# create each txt file for output and input headers
with open('valuation.txt', 'w') as outfile:
for item in valuation_headers:
outfile.write("{},".format(item))
with open('improvements.txt', 'w') as outfile:
for item in improvements_headers:
outfile.write("{},".format(item))
with open('taxes.txt', 'w') as outfile:
for item in taxes_headers:
outfile.write("{},".format(item))
with open('summary.txt', 'w') as outfile:
for item in summary_headers:
outfile.write("{},".format(item))
with open('errors.txt', 'w') as outfile:
outfile.write("property_id,error")
def write_error(property_id, error_type):
with open('errors.txt', 'a', newline='') as outfile:
outfile.write("{}{}\n".format(property_id, error_type))
def main():
choice()
if __name__ == '__main__':
main()