-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess-irs-files.R
More file actions
94 lines (71 loc) · 2.22 KB
/
process-irs-files.R
File metadata and controls
94 lines (71 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#' ---
#' title: "process-irs-files.R"
#' author: "Chris Busch cbusch002@regis.edu"
#' date: "2017"
#' ---
#
#'
#'# Load libraries
#'
library(stringr)
library(psych)
# Every relative path used below (data/irs, data/irsclean, common.R) is
# resolved against this directory, so it must exist before anything else runs.
setwd("~/../practicum2")
# Pull in the shared project code; what it defines is not visible from this
# file.
source("common.R")
#'
#'# Process IRS data files
#'
# Convert every raw IRS SOI county file matching data/irs/1*.csv into a
# per-year table of per-return measures and write it to
# data/irsclean/<year>-irs-soi.csv.
#
# Side effects kept from the original script: `f`, `fn`, `d` and `d2` remain
# in the global environment after the loop, and `d2` holds the table built
# from the last file processed (the diagnostics further down rely on that).

# write.csv() does not create missing directories, so make sure the target
# exists; this is a no-op when it is already there.
dir.create("data/irsclean", showWarnings = FALSE, recursive = TRUE)

for (f in Sys.glob("data/irs/1*.csv")) {
  message("processing ", f)

  # The glob guarantees the file name starts with a digit; those leading
  # digits are the two-digit year. Working on basename() keeps the parse
  # independent of the path separator returned by Sys.glob().
  fn <- 2000 + as.numeric(sub("^(\\d+).*$", "\\1", basename(f)))

  # Read once: only the two FIPS columns are forced to character so any
  # leading zeros survive; all other columns keep read.csv's type detection.
  d <- read.csv(
    f,
    stringsAsFactors = FALSE,
    colClasses = c(STATEFIPS = "character", COUNTYFIPS = "character")
  )

  # Build the 5-character county key as 2 state digits + 3 county digits.
  # Padding explicitly guards against files that store the codes without
  # leading zeros, where plain concatenation would give ambiguous keys
  # (e.g. "1" + "1" -> "11"); already-padded codes come out unchanged.
  d$fips <- paste0(
    sprintf("%02d", as.integer(d$STATEFIPS)),
    sprintf("%03d", as.integer(d$COUNTYFIPS))
  )
  str(d)

  # Identifier columns first; the diagnostics and plots later in the script
  # skip the first five columns by position, so this order must not change.
  d2 <- d[, c("STATE", "COUNTYNAME", "fips")]
  d2$Year <- fn
  d2$num.returns <- d$N1

  # Every derived measure is the raw column divided by the return count N1.
  d2$married.pct <- d$MARS2 / d$N1
  d2$dependents.ratio <- d$NUMDEP / d$N1
  d2$adjusted.gross.income.avg <- d$A00100 / d$N1
  d2$wages.avg <- d$A00200 / d$N1
  d2$farming.ratio <- d$SCHF / d$N1
  d2$unemployed.ratio <- d$N02300 / d$N1
  d2$dividends.ratio <- d$N00600 / d$N1
  d2$business.ratio <- d$N00900 / d$N1
  d2$realestate.ratio <- d$N18500 / d$N1 # indicator of ownership
  d2$mortgage.ratio <- d$N19300 / d$N1 # indicator of ownership
  d2$contributions.ratio <- d$A19700 / d$N1 # indicator of giving?
  d2$taxcredits.ratio <- d$N07100 / d$N1

  write.csv(d2, paste0("data/irsclean/", fn, "-irs-soi.csv"), row.names = FALSE)
  #summary(d2)
  #describe(d2)
}
#' Noticed a lot of skew
# `d2` at this point is whatever the processing loop left behind, i.e. the
# table built from the last file matched by the glob.
summary(d2)
hist(d2$unemployed.ratio)
plot(density(asinh(d2$contributions.ratio)))
#' May need to transform or winsorize the data
#' With outliers:
# Columns 1-5 (STATE, COUNTYNAME, fips, Year, num.returns) are identifiers and
# counts, so only the derived per-return columns are checked for skew.
moments::skewness(d2[-(1:5)])
#' Without outliers via winsorizing:
moments::skewness(winsor(d2[-(1:5)]))
#' De-leveraged outliers via transformation:
# asinh is applied elementwise to the numeric matrix of derived columns.
moments::skewness(asinh(as.matrix(d2[-(1:5)])))
library(dplyr)
library(choroplethr)
library(choroplethrMaps)
#' Helpful text: https://www.gislounge.com/mapping-county-demographic-data-in-r/
#'
#'# County Plots of Derived Data
#'
# Draw one county map per derived column (everything after the first five
# identifier/count columns of `d2`), using the column name as the map title.
for (n in names(d2)[-(1:5)]) {
  print(
    county_choropleth(
      # region is the numeric form of the fips key; value is the column
      # being mapped.
      data.frame(region = as.numeric(d2$fips), value = d2[[n]]),
      title = paste0(" ", n)
    )
  )
}
#' Another option for county maps:
#' https://stackoverflow.com/questions/25875877/remove-border-lines-in-ggplot-map-choropleth
#' https://www.arilamstein.com/blog/2015/07/02/exploring-the-demographics-of-ferguson-missouri/
# end of file