From 185c3c4edcb8b83db4ba07d293d228fa57f5295c Mon Sep 17 00:00:00 2001 From: Andrea Suozzo Date: Fri, 7 Apr 2023 17:11:00 -0400 Subject: [PATCH 01/39] point to propublica metadata submodule --- .gitmodules | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 2d13bc8..563a07c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,8 +1,8 @@ [submodule "irs_reader/metadata"] path = irs_reader/metadata - url = https://github.com/jsfenfen/990-xml-metadata + url = https://github.com/propublica/990-xml-metadata branch = master [submodule "metadata"] path = metadata - url = https://github.com/jsfenfen/990-xml-metadata + url = https://github.com/propublica/990-xml-metadata branch = master From 05f56c52fdefb6d5c8be363b34709fd3af8ace04 Mon Sep 17 00:00:00 2001 From: Andrea Suozzo Date: Mon, 10 Apr 2023 17:59:41 -0400 Subject: [PATCH 02/39] update submodules again --- .gitmodules | 7 +++---- irs_reader/metadata | 2 +- metadata | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.gitmodules b/.gitmodules index 563a07c..43c4886 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,8 +1,7 @@ [submodule "irs_reader/metadata"] path = irs_reader/metadata - url = https://github.com/propublica/990-xml-metadata - branch = master + url = git@github.com:propublica/990-xml-metadata.git [submodule "metadata"] path = metadata - url = https://github.com/propublica/990-xml-metadata - branch = master + url = git@github.com:propublica/990-xml-metadata.git + diff --git a/irs_reader/metadata b/irs_reader/metadata index 4ad69cc..44de89f 160000 --- a/irs_reader/metadata +++ b/irs_reader/metadata @@ -1 +1 @@ -Subproject commit 4ad69cc0f68dedb1137ccae34c4c84f88295b0a9 +Subproject commit 44de89f4686909065fbf0f7fa31dd550dfa4591a diff --git a/metadata b/metadata index 4ad69cc..44de89f 160000 --- a/metadata +++ b/metadata @@ -1 +1 @@ -Subproject commit 4ad69cc0f68dedb1137ccae34c4c84f88295b0a9 +Subproject commit 
44de89f4686909065fbf0f7fa31dd550dfa4591a From 874b6f16d8617a48a91a5b433951dec6cf6fccfb Mon Sep 17 00:00:00 2001 From: Andrea Suozzo Date: Mon, 10 Apr 2023 18:03:51 -0400 Subject: [PATCH 03/39] update submodules again --- .gitmodules | 7 +++---- irs_reader/metadata | 2 +- metadata | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.gitmodules b/.gitmodules index 563a07c..43c4886 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,8 +1,7 @@ [submodule "irs_reader/metadata"] path = irs_reader/metadata - url = https://github.com/propublica/990-xml-metadata - branch = master + url = git@github.com:propublica/990-xml-metadata.git [submodule "metadata"] path = metadata - url = https://github.com/propublica/990-xml-metadata - branch = master + url = git@github.com:propublica/990-xml-metadata.git + diff --git a/irs_reader/metadata b/irs_reader/metadata index 4ad69cc..44de89f 160000 --- a/irs_reader/metadata +++ b/irs_reader/metadata @@ -1 +1 @@ -Subproject commit 4ad69cc0f68dedb1137ccae34c4c84f88295b0a9 +Subproject commit 44de89f4686909065fbf0f7fa31dd550dfa4591a diff --git a/metadata b/metadata index 4ad69cc..e79032e 160000 --- a/metadata +++ b/metadata @@ -1 +1 @@ -Subproject commit 4ad69cc0f68dedb1137ccae34c4c84f88295b0a9 +Subproject commit e79032e3d2988603348c9b81eb291ba77919e4b0 From 1b69c7063cb3eed6721c84e501d5bf7c2df211a9 Mon Sep 17 00:00:00 2001 From: Andrea Suozzo Date: Tue, 11 Apr 2023 14:18:09 -0400 Subject: [PATCH 04/39] support 2021 versions --- irs_reader/settings.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/irs_reader/settings.py b/irs_reader/settings.py index 5a294ca..4045abc 100644 --- a/irs_reader/settings.py +++ b/irs_reader/settings.py @@ -36,12 +36,15 @@ # these could get pushed to metadata directory? 
ALLOWED_VERSIONSTRINGS = [ - '2013v3.0', '2013v3.1', '2013v4.0', '2014v5.0', '2014v6.0', - '2015v2.0', '2015v2.1', '2015v3.0', '2016v3.0', '2016v3.1', - '2017v2.0', '2017v2.1', '2017v2.2', '2017v2.3', '2018v3.0', - '2018v3.1', '2018v3.2', '2018v3.3', '2019v5.0', '2019v5.1', - '2019v5.2', '2020v1.0', '2020v2.0', '2020v3.0', '2020v4.0', - '2020v4.1', '2020v4.2', '2020v1.1' + '2013v3.0', '2013v3.1', '2013v4.0', + '2014v5.0', '2014v6.0', + '2015v2.0', '2015v2.1', '2015v3.0', + '2016v3.0', '2016v3.1', + '2017v2.0', '2017v2.1', '2017v2.2', '2017v2.3', + '2018v3.0', '2018v3.1', '2018v3.2', '2018v3.3', + '2019v5.0', '2019v5.1', '2019v5.2', + '2020v1.0', '2020v2.0', '2020v3.0', '2020v4.0','2020v4.1', '2020v4.2', '2020v1.1', + '2021v4.0','2021v4.1','2021v4.2','2021v4.3' ] # 2020 is experimental From 32dcd93cb2877997f1c636616cc3a77f790cbb9a Mon Sep 17 00:00:00 2001 From: Andrea Suozzo Date: Tue, 2 May 2023 14:49:02 -0400 Subject: [PATCH 05/39] add pipfile --- Pipfile | 14 ++++++++++ Pipfile.lock | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 Pipfile create mode 100644 Pipfile.lock diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..062e427 --- /dev/null +++ b/Pipfile @@ -0,0 +1,14 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +xmltodict = "*" +requests = "*" +unicodecsv = "*" + +[dev-packages] + +[requires] +python_version = "3.10" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..1607375 --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,76 @@ +{ + "_meta": { + "hash": { + "sha256": "0bfbd74446ded500b3ebe1d62845c302a718290f8e5c4a4fb957f83ae55ea349" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.10" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "certifi": { + "hashes": [ + 
"sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d", + "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2022.6.15" + }, + "charset-normalizer": { + "hashes": [ + "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5", + "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2.1.0" + }, + "idna": { + "hashes": [ + "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", + "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" + ], + "markers": "python_version >= '3.5'", + "version": "==3.3" + }, + "requests": { + "hashes": [ + "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983", + "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349" + ], + "index": "pypi", + "version": "==2.28.1" + }, + "unicodecsv": { + "hashes": [ + "sha256:018c08037d48649a0412063ff4eda26eaa81eff1546dbffa51fa5293276ff7fc" + ], + "index": "pypi", + "version": "==0.14.1" + }, + "urllib3": { + "hashes": [ + "sha256:c33ccba33c819596124764c23a97d25f32b28433ba0dedeb77d873a38722c9bc", + "sha256:ea6e8fb210b19d950fab93b60c9009226c63a28808bc8386e05301e25883ac0a" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' and python_version < '4'", + "version": "==1.26.11" + }, + "xmltodict": { + "hashes": [ + "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56", + "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852" + ], + "index": "pypi", + "version": "==0.13.0" + } + }, + "develop": {} +} From 9305137b72b652e201fa42b35d0a760376f9abac Mon Sep 17 00:00:00 2001 From: Andrea Suozzo Date: Sun, 7 May 2023 15:18:05 -0400 Subject: [PATCH 06/39] add allowed versionstrings --- irs_reader/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/irs_reader/settings.py b/irs_reader/settings.py index 4045abc..101aec5 100644 --- a/irs_reader/settings.py +++ b/irs_reader/settings.py @@ -43,7 +43,7 @@ '2017v2.0', '2017v2.1', '2017v2.2', '2017v2.3', '2018v3.0', '2018v3.1', '2018v3.2', '2018v3.3', '2019v5.0', '2019v5.1', '2019v5.2', - '2020v1.0', '2020v2.0', '2020v3.0', '2020v4.0','2020v4.1', '2020v4.2', '2020v1.1', + '2020v1.0', '2020v1.1','2020v1.2','2020v1.3', '2020v2.0', '2020v3.0', '2020v4.0','2020v4.1', '2020v4.2', '2021v4.0','2021v4.1','2021v4.2','2021v4.3' ] From a275d64d536485406b3b94088b4fb79632a77c4f Mon Sep 17 00:00:00 2001 From: Andrea Suozzo Date: Mon, 8 May 2023 10:23:22 -0400 Subject: [PATCH 07/39] fix namespacing --- irs_reader/filing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/irs_reader/filing.py b/irs_reader/filing.py index e0e432e..a5e67fc 100644 --- a/irs_reader/filing.py +++ b/irs_reader/filing.py @@ -74,14 +74,14 @@ def _denamespacify(self,entity): This is very rare; see 201940149349301304_public.xml for an example. 
""" thisentitytype = type(entity) - if thisentitytype == orderedDictType: + if thisentitytype == orderedDictType or thisentitytype == dictType: newOD = OrderedDict() for key in entity.keys(): newkey = key if ":" in key: newkey = key.split(":")[1] newvalue = entity[key] - if type(newvalue) == listType or type(newvalue) == orderedDictType: + if type(newvalue) == listType or type(newvalue) == orderedDictType or type(newvalue) == dictType: newvalue = self._denamespacify(newvalue) newOD[newkey] = newvalue return newOD From 8308fc42e3fd03d8f178f11c1d72c2d9c775be01 Mon Sep 17 00:00:00 2001 From: Andrea Suozzo Date: Wed, 24 May 2023 10:23:30 -0400 Subject: [PATCH 08/39] add 2022 990x version strings --- irs_reader/settings.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/irs_reader/settings.py b/irs_reader/settings.py index 101aec5..e7d6682 100644 --- a/irs_reader/settings.py +++ b/irs_reader/settings.py @@ -44,7 +44,8 @@ '2018v3.0', '2018v3.1', '2018v3.2', '2018v3.3', '2019v5.0', '2019v5.1', '2019v5.2', '2020v1.0', '2020v1.1','2020v1.2','2020v1.3', '2020v2.0', '2020v3.0', '2020v4.0','2020v4.1', '2020v4.2', - '2021v4.0','2021v4.1','2021v4.2','2021v4.3' + '2021v4.0','2021v4.1','2021v4.2','2021v4.3', + '2022v4.0','2022v4.1','2022v5.0' ] # 2020 is experimental From 46eeef135155e362e5eb64ee234da9f1ae672435 Mon Sep 17 00:00:00 2001 From: Andrea Suozzo Date: Tue, 1 Aug 2023 17:49:26 -0400 Subject: [PATCH 09/39] fix namespace function --- irs_reader/filing.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/irs_reader/filing.py b/irs_reader/filing.py index a5e67fc..172685b 100644 --- a/irs_reader/filing.py +++ b/irs_reader/filing.py @@ -79,7 +79,7 @@ def _denamespacify(self,entity): for key in entity.keys(): newkey = key if ":" in key: - newkey = key.split(":")[1] + newkey = key.split(":")[1] newvalue = entity[key] if type(newvalue) == listType or type(newvalue) == orderedDictType or type(newvalue) == dictType: newvalue = 
self._denamespacify(newvalue) @@ -90,7 +90,7 @@ def _denamespacify(self,entity): newlist = list() for item in entity: newvalue = item - if type(newvalue) == listType or type(newvalue) == orderedDictType: + if type(newvalue) == listType or type(newvalue) == orderedDictType or type(newvalue) == dictType: newvalue = self._denamespacify(newvalue) newlist.append(newvalue) return newlist @@ -103,7 +103,6 @@ def _set_dict_from_xml(self): with io.open(self.filepath, 'r', encoding='utf-8-sig') as fh: raw_file = fh.read() try: - self.raw_irs_dict = self._denamespacify(xmltodict.parse(raw_file)) except ExpatError: raise InvalidXMLException( From befcce04b69340d4f48f54762403aaab583ee9ff Mon Sep 17 00:00:00 2001 From: Andrea Suozzo Date: Mon, 21 Aug 2023 14:26:14 -0400 Subject: [PATCH 10/39] add new schema versions --- irs_reader/settings.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/irs_reader/settings.py b/irs_reader/settings.py index e7d6682..6904a1d 100644 --- a/irs_reader/settings.py +++ b/irs_reader/settings.py @@ -45,7 +45,8 @@ '2019v5.0', '2019v5.1', '2019v5.2', '2020v1.0', '2020v1.1','2020v1.2','2020v1.3', '2020v2.0', '2020v3.0', '2020v4.0','2020v4.1', '2020v4.2', '2021v4.0','2021v4.1','2021v4.2','2021v4.3', - '2022v4.0','2022v4.1','2022v5.0' + '2022v4.0','2022v4.1','2022v5.0', + '2022v6.0','2022v7.0' ] # 2020 is experimental From 52911ac646c30f0b897b47c27b6c1ec35e48db3c Mon Sep 17 00:00:00 2001 From: Andrea Suozzo Date: Thu, 5 Oct 2023 14:55:21 -0400 Subject: [PATCH 11/39] capture attributes so we can parse subsection # --- irs_reader/metadata | 2 +- irs_reader/sked_dict_reader.py | 180 ++++++++++++++++----------------- metadata | 2 +- 3 files changed, 92 insertions(+), 92 deletions(-) diff --git a/irs_reader/metadata b/irs_reader/metadata index e79032e..d5885bc 160000 --- a/irs_reader/metadata +++ b/irs_reader/metadata @@ -1 +1 @@ -Subproject commit e79032e3d2988603348c9b81eb291ba77919e4b0 +Subproject commit 
d5885bc298f8867dcaa9817918699d4518ebdcb1 diff --git a/irs_reader/sked_dict_reader.py b/irs_reader/sked_dict_reader.py index a7e52c7..5c2e722 100644 --- a/irs_reader/sked_dict_reader.py +++ b/irs_reader/sked_dict_reader.py @@ -93,45 +93,45 @@ def _process_group(self, json_node, path, this_group): standardized_group_dict = self._get_table_start() for xpath in flattened_list_item.keys(): - if '@' in xpath: - continue - else: - xpath = xpath.replace("/#text", "") - value = flattened_list_item[xpath] - - if self.csv_format: - this_var = { - 'xpath':xpath, - 'value':value, - 'in_group':True, - 'group_name':this_group['db_name'], - 'group_index':node_index - } - self.for_csv_list.append(this_var) + # if '@' in xpath: + # continue + # else: + xpath = xpath.replace("/#text", "") + value = flattened_list_item[xpath] + + if self.csv_format: + this_var = { + 'xpath':xpath, + 'value':value, + 'in_group':True, + 'group_name':this_group['db_name'], + 'group_index':node_index + } + self.for_csv_list.append(this_var) - try: - this_var_data = self.standardizer.get_var(xpath) - except KeyError: - if not ignorable_keyerror(xpath): - self.variable_keyerrors.append( - {'element_path':xpath} - ) - continue - this_var_value = flattened_list_item[xpath] - this_var_name = this_var_data['db_name'] - table_name = this_var_data['db_table'] - if self.documentation: - result = { - 'value': this_var_value, - 'ordering': this_var_data['ordering'], - 'line_number': this_var_data['line_number'], - 'description': this_var_data['description'], - 'db_type': this_var_data['db_type'] - } - standardized_group_dict[this_var_name] = result + try: + this_var_data = self.standardizer.get_var(xpath) + except KeyError: + if not ignorable_keyerror(xpath): + self.variable_keyerrors.append( + {'element_path':xpath} + ) + continue + this_var_value = flattened_list_item[xpath] + this_var_name = this_var_data['db_name'] + table_name = this_var_data['db_table'] + if self.documentation: + result = { + 'value': 
this_var_value, + 'ordering': this_var_data['ordering'], + 'line_number': this_var_data['line_number'], + 'description': this_var_data['description'], + 'db_type': this_var_data['db_type'] + } + standardized_group_dict[this_var_name] = result - else: - standardized_group_dict[this_var_name] = this_var_value + else: + standardized_group_dict[this_var_name] = this_var_value try: self.repeating_groups[table_name].append(standardized_group_dict) except KeyError: @@ -155,67 +155,67 @@ def _parse_json(self, json_node, parent_path=""): elif this_node_type == unicodeType: # but ignore it if is an @. - if '@' in element_path: - pass - else: - element_path = element_path.replace("/#text", "") + # if '@' in element_path: + # pass + # else: + element_path = element_path.replace("/#text", "") + try: + # is it a group? + this_group = self.groups[element_path] + self._process_group( + [{parent_path: json_node}], + '', + this_group + ) + + except KeyError: + + # It's not a group so it should be a variable we know about + + if self.csv_format: + this_var = { + 'xpath':element_path, + 'value':json_node, + 'in_group':False, + 'group_name':None, + 'group_index':None + } + self.for_csv_list.append(this_var) + + # It's not a group so it should be a variable we know about try: - # is it a group? - this_group = self.groups[element_path] - self._process_group( - [{parent_path: json_node}], - '', - this_group - ) + var_data = self.standardizer.get_var(element_path) + var_found = True except KeyError: + # pass through for some common key errors + # [ TODO: FIX THE KEYERRORS! 
] + if not ignorable_keyerror(element_path): + self.variable_keyerrors.append( + {'element_path':element_path} + ) + var_found = False + + if var_found: + + table_name = var_data['db_table'] + var_name = var_data['db_name'] - # It's not a group so it should be a variable we know about - - if self.csv_format: - this_var = { - 'xpath':element_path, - 'value':json_node, - 'in_group':False, - 'group_name':None, - 'group_index':None + result = json_node + if self.documentation: + result = { + 'value': json_node, + 'ordering': var_data['ordering'], + 'line_number': var_data['line_number'], + 'description': var_data['description'], + 'db_type': var_data['db_type'] } - self.for_csv_list.append(this_var) - # It's not a group so it should be a variable we know about try: - var_data = self.standardizer.get_var(element_path) - var_found = True - + self.schedule_parts[table_name][var_name] = result except KeyError: - # pass through for some common key errors - # [ TODO: FIX THE KEYERRORS! ] - if not ignorable_keyerror(element_path): - self.variable_keyerrors.append( - {'element_path':element_path} - ) - var_found = False - - if var_found: - - table_name = var_data['db_table'] - var_name = var_data['db_name'] - - result = json_node - if self.documentation: - result = { - 'value': json_node, - 'ordering': var_data['ordering'], - 'line_number': var_data['line_number'], - 'description': var_data['description'], - 'db_type': var_data['db_type'] - } - - try: - self.schedule_parts[table_name][var_name] = result - except KeyError: - self.schedule_parts[table_name] = self._get_table_start() - self.schedule_parts[table_name][var_name] = result + self.schedule_parts[table_name] = self._get_table_start() + self.schedule_parts[table_name][var_name] = result elif this_node_type == orderedDictType or this_node_type == dictType: diff --git a/metadata b/metadata index e79032e..d5885bc 160000 --- a/metadata +++ b/metadata @@ -1 +1 @@ -Subproject commit e79032e3d2988603348c9b81eb291ba77919e4b0 
+Subproject commit d5885bc298f8867dcaa9817918699d4518ebdcb1 From b57066032f478ae138736b307e85ae9785d2b895 Mon Sep 17 00:00:00 2001 From: Andrea Suozzo Date: Thu, 5 Oct 2023 15:26:56 -0400 Subject: [PATCH 12/39] parse 990EZ subsection --- irs_reader/metadata | 2 +- metadata | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/irs_reader/metadata b/irs_reader/metadata index d5885bc..fbc5b8f 160000 --- a/irs_reader/metadata +++ b/irs_reader/metadata @@ -1 +1 @@ -Subproject commit d5885bc298f8867dcaa9817918699d4518ebdcb1 +Subproject commit fbc5b8f3f7baa2c611ed653d5429c93a8a9aa609 diff --git a/metadata b/metadata index d5885bc..fbc5b8f 160000 --- a/metadata +++ b/metadata @@ -1 +1 @@ -Subproject commit d5885bc298f8867dcaa9817918699d4518ebdcb1 +Subproject commit fbc5b8f3f7baa2c611ed653d5429c93a8a9aa609 From a919048bcb491b79ecb7d7103a08e7f63bc0bf19 Mon Sep 17 00:00:00 2001 From: Andrea Suozzo Date: Thu, 26 Oct 2023 15:45:55 -0400 Subject: [PATCH 13/39] change gitmodules to relative --- .gitmodules | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 43c4886..47b82f7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,7 @@ [submodule "irs_reader/metadata"] path = irs_reader/metadata - url = git@github.com:propublica/990-xml-metadata.git + url = ../990-xml-metadata.git [submodule "metadata"] path = metadata - url = git@github.com:propublica/990-xml-metadata.git + url = ../990-xml-metadata.git From 5c6fd858f2bdc6fd71693b3c46876a66747b3882 Mon Sep 17 00:00:00 2001 From: Andrea Suozzo Date: Thu, 30 Nov 2023 10:27:11 -0500 Subject: [PATCH 14/39] add guesses for 2023 schema versions --- irs_reader/settings.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/irs_reader/settings.py b/irs_reader/settings.py index 6904a1d..b20ea4a 100644 --- a/irs_reader/settings.py +++ b/irs_reader/settings.py @@ -46,7 +46,15 @@ '2020v1.0', '2020v1.1','2020v1.2','2020v1.3', '2020v2.0', '2020v3.0', 
'2020v4.0','2020v4.1', '2020v4.2', '2021v4.0','2021v4.1','2021v4.2','2021v4.3', '2022v4.0','2022v4.1','2022v5.0', - '2022v6.0','2022v7.0' + '2022v6.0','2022v7.0', + # these are guesses for future 2023 schemas; they might not actually exist + '2023v1.0', + '2023v2.0', + '2023v3.0','2023v3.1','2023v3.2','2023v3.3', + '2023v4.0','2023v4.1','2023v4.2','2023v4.3', + '2023v5.0','2023v5.1','2023v5.2','2023v5.3', + '2023v6.0','2023v6.1','2023v6.2','2023v6.3', + '2023v7.0','2023v7.1','2023v7.2','2023v7.3', ] # 2020 is experimental From 1f989f4a93fed8fa7b9afa64cf1445fb671af041 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Wed, 12 Jun 2024 23:14:40 -0400 Subject: [PATCH 15/39] use giving tuesday bucket, bring back downloads, get tests working also blacken and isort --- .gitmodules | 6 +- Pipfile | 14 - Pipfile.lock | 76 - irs_reader/_version.py | 2 +- irs_reader/dir_utils.py | 2 +- irs_reader/file_utils.py | 35 +- irs_reader/filing.py | 149 +- irs_reader/flatten_utils.py | 5 +- irs_reader/irsx_cli.py | 98 +- irs_reader/irsx_index_cli.py | 21 +- irs_reader/irsx_retrieve_cli.py | 60 +- irs_reader/keyerror_utils.py | 53 +- irs_reader/local_settings-example.py | 8 +- irs_reader/log_utils.py | 6 +- irs_reader/object_ids.py | 3008 +++++++++++++++++++++++++- irs_reader/settings.py | 141 +- irs_reader/sked_dict_reader.py | 165 +- irs_reader/standardizer.py | 115 +- irs_reader/text_format_utils.py | 157 +- irs_reader/type_utils.py | 4 +- irs_reader/xmlrunner.py | 101 +- metadata | 1 - setup.py | 85 +- tests.py | 242 --- tests/tests.py | 232 ++ tox.ini | 5 - 26 files changed, 3931 insertions(+), 860 deletions(-) delete mode 100644 Pipfile delete mode 100644 Pipfile.lock delete mode 160000 metadata delete mode 100644 tests.py create mode 100644 tests/tests.py delete mode 100644 tox.ini diff --git a/.gitmodules b/.gitmodules index 47b82f7..edf6f01 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,3 @@ [submodule "irs_reader/metadata"] path = irs_reader/metadata - url = 
../990-xml-metadata.git -[submodule "metadata"] - path = metadata - url = ../990-xml-metadata.git - + url = https://github.com/propublica/990-xml-metadata.git diff --git a/Pipfile b/Pipfile deleted file mode 100644 index 062e427..0000000 --- a/Pipfile +++ /dev/null @@ -1,14 +0,0 @@ -[[source]] -url = "https://pypi.org/simple" -verify_ssl = true -name = "pypi" - -[packages] -xmltodict = "*" -requests = "*" -unicodecsv = "*" - -[dev-packages] - -[requires] -python_version = "3.10" diff --git a/Pipfile.lock b/Pipfile.lock deleted file mode 100644 index 1607375..0000000 --- a/Pipfile.lock +++ /dev/null @@ -1,76 +0,0 @@ -{ - "_meta": { - "hash": { - "sha256": "0bfbd74446ded500b3ebe1d62845c302a718290f8e5c4a4fb957f83ae55ea349" - }, - "pipfile-spec": 6, - "requires": { - "python_version": "3.10" - }, - "sources": [ - { - "name": "pypi", - "url": "https://pypi.org/simple", - "verify_ssl": true - } - ] - }, - "default": { - "certifi": { - "hashes": [ - "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d", - "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412" - ], - "markers": "python_full_version >= '3.6.0'", - "version": "==2022.6.15" - }, - "charset-normalizer": { - "hashes": [ - "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5", - "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413" - ], - "markers": "python_full_version >= '3.6.0'", - "version": "==2.1.0" - }, - "idna": { - "hashes": [ - "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", - "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" - ], - "markers": "python_version >= '3.5'", - "version": "==3.3" - }, - "requests": { - "hashes": [ - "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983", - "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349" - ], - "index": "pypi", - "version": "==2.28.1" - }, - "unicodecsv": { - "hashes": [ - 
"sha256:018c08037d48649a0412063ff4eda26eaa81eff1546dbffa51fa5293276ff7fc" - ], - "index": "pypi", - "version": "==0.14.1" - }, - "urllib3": { - "hashes": [ - "sha256:c33ccba33c819596124764c23a97d25f32b28433ba0dedeb77d873a38722c9bc", - "sha256:ea6e8fb210b19d950fab93b60c9009226c63a28808bc8386e05301e25883ac0a" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' and python_version < '4'", - "version": "==1.26.11" - }, - "xmltodict": { - "hashes": [ - "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56", - "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852" - ], - "index": "pypi", - "version": "==0.13.0" - } - }, - "develop": {} -} diff --git a/irs_reader/_version.py b/irs_reader/_version.py index 73e3bb4..f9aa3e1 100644 --- a/irs_reader/_version.py +++ b/irs_reader/_version.py @@ -1 +1 @@ -__version__ = '0.3.2' +__version__ = "0.3.2" diff --git a/irs_reader/dir_utils.py b/irs_reader/dir_utils.py index 64ac79d..d218da6 100644 --- a/irs_reader/dir_utils.py +++ b/irs_reader/dir_utils.py @@ -3,7 +3,7 @@ def mkdir_p(paths): - """ Makedirs, from http://stackoverflow.com/a/600612 """ + """Makedirs, from http://stackoverflow.com/a/600612""" for path in paths: try: os.makedirs(path) diff --git a/irs_reader/file_utils.py b/irs_reader/file_utils.py index 3eaa2b0..7fe8882 100644 --- a/irs_reader/file_utils.py +++ b/irs_reader/file_utils.py @@ -1,11 +1,17 @@ -import re import os +import re +from datetime import datetime + import requests -from datetime import datetime -from .settings import IRS_XML_HTTP_BASE, WORKING_DIRECTORY, INDEX_DIRECTORY, IRS_INDEX_BASE +from .settings import ( + INDEX_DIRECTORY, + IRS_INDEX_BASE, + IRS_XML_HTTP_BASE, + WORKING_DIRECTORY, +) -OBJECT_ID_RE = re.compile(r'20\d{16}') +OBJECT_ID_RE = re.compile(r"20\d{16}") # Not sure how much detail we need to go into here OBJECT_ID_MSG = """ @@ -18,30 +24,28 @@ def stream_download(url, target_path, verbose=False): - """ 
Download a large file without loading it into memory. """ + """Download a large file without loading it into memory.""" response = requests.get(url, stream=True) handle = open(target_path, "wb") if verbose: print("Beginning streaming download of %s" % url) start = datetime.now() try: - content_length = int(response.headers['Content-Length']) - content_MB = content_length/1048576.0 + content_length = int(response.headers["Content-Length"]) + content_MB = content_length / 1048576.0 print("Total file size: %.2f MB" % content_MB) except KeyError: - pass # allow Content-Length to be missing + pass # allow Content-Length to be missing for chunk in response.iter_content(chunk_size=512): - if chunk: # filter out keep-alive new chunks + if chunk: # filter out keep-alive new chunks handle.write(chunk) if verbose: - print( - "Download completed to %s in %s" % - (target_path, datetime.now() - start)) + print("Download completed to %s in %s" % (target_path, datetime.now() - start)) def validate_object_id(object_id): - """ It's easy to make a mistake entering these, validate the format """ + """It's easy to make a mistake entering these, validate the format""" result = re.match(OBJECT_ID_RE, str(object_id)) if not result: print("'%s' appears not to be a valid 990 object_id" % object_id) @@ -49,9 +53,8 @@ def validate_object_id(object_id): return object_id -# Files are no longer available on S3 -# def get_s3_URL(object_id): -# return ("%s/%s_public.xml" % (IRS_XML_HTTP_BASE, object_id)) +def get_s3_URL(object_id): + return "%s/%s_public.xml" % (IRS_XML_HTTP_BASE, object_id) def get_local_path(object_id): diff --git a/irs_reader/filing.py b/irs_reader/filing.py index 172685b..918599b 100644 --- a/irs_reader/filing.py +++ b/irs_reader/filing.py @@ -1,17 +1,22 @@ -import os -import sys import io -import xmltodict import json +import os +import sys from collections import OrderedDict from xml.parsers.expat import ExpatError -from .type_utils import dictType, orderedDictType, 
listType, \ - unicodeType, noneType, strType -from .file_utils import stream_download, validate_object_id, \ - get_local_path +import xmltodict -from .settings import KNOWN_SCHEDULES, IRS_READER_ROOT +from .file_utils import get_local_path, get_s3_URL, stream_download, validate_object_id +from .settings import IRS_READER_ROOT, KNOWN_SCHEDULES +from .type_utils import ( + dictType, + listType, + noneType, + orderedDictType, + strType, + unicodeType, +) class InvalidXMLException(Exception): @@ -21,18 +26,18 @@ class InvalidXMLException(Exception): class FileMissingException(Exception): pass -class Filing(object): +class Filing(object): def __init__(self, object_id, filepath=None, URL=None, json=None): - """ Filepath is the location of the file locally; - URL is it's remote location (if not default) - Ignore these and defaults will be used. - If filepath is set, URL is ignored. - json is a json representation of the data, so if given, - no file will be downloaded. + """Filepath is the location of the file locally; + URL is it's remote location (if not default) + Ignore these and defaults will be used. + If filepath is set, URL is ignored. + json is a json representation of the data, so if given, + no file will be downloaded. 
""" - self.raw_irs_dict = None # The parsed xml will go here - self.version_string = None # Version number here + self.raw_irs_dict = None # The parsed xml will go here + self.version_string = None # Version number here self.object_id = validate_object_id(object_id) self.result = None @@ -42,10 +47,10 @@ def __init__(self, object_id, filepath=None, URL=None, json=None): if json: self.json = json - self.input_type = 'json' + self.input_type = "json" else: self.json = None - self.input_type = 'xml' + self.input_type = "xml" if filepath: self.filepath = filepath else: @@ -53,22 +58,21 @@ def __init__(self, object_id, filepath=None, URL=None, json=None): if URL: self.URL = URL - + else: + self.URL = get_s3_URL(self.object_id) def _download(self, force_overwrite=False, verbose=False): - """ - Files are no longer downloadable. - """ - - if os.path.isfile(self.filepath): - return True - else: - raise FileMissingException( - "Filing not available, try downloading with irsx_retrieve [ YEAR ]" - ) - - def _denamespacify(self,entity): + if not force_overwrite: + # If the file is already there, we're done + if os.path.isfile(self.filepath): + if verbose: + print("File already available at %s -- skipping" % (self.filepath)) + return False + stream_download(self.URL, self.filepath, verbose=verbose) + return True + + def _denamespacify(self, entity): """ It's legal to include namespaces in the xml tags, e.g. irs:Return instead of Return This is very rare; see 201940149349301304_public.xml for an example. 
@@ -79,9 +83,13 @@ def _denamespacify(self,entity): for key in entity.keys(): newkey = key if ":" in key: - newkey = key.split(":")[1] + newkey = key.split(":")[1] newvalue = entity[key] - if type(newvalue) == listType or type(newvalue) == orderedDictType or type(newvalue) == dictType: + if ( + type(newvalue) == listType + or type(newvalue) == orderedDictType + or type(newvalue) == dictType + ): newvalue = self._denamespacify(newvalue) newOD[newkey] = newvalue return newOD @@ -90,52 +98,57 @@ def _denamespacify(self,entity): newlist = list() for item in entity: newvalue = item - if type(newvalue) == listType or type(newvalue) == orderedDictType or type(newvalue) == dictType: + if ( + type(newvalue) == listType + or type(newvalue) == orderedDictType + or type(newvalue) == dictType + ): newvalue = self._denamespacify(newvalue) newlist.append(newvalue) return newlist - else: + else: return entity - def _set_dict_from_xml(self): - # io works across python2 and 3, and allows an encoding arg - with io.open(self.filepath, 'r', encoding='utf-8-sig') as fh: + # io works across python2 and 3, and allows an encoding arg + with io.open(self.filepath, "r", encoding="utf-8-sig") as fh: raw_file = fh.read() try: self.raw_irs_dict = self._denamespacify(xmltodict.parse(raw_file)) except ExpatError: raise InvalidXMLException( - "\nXML Parse error in " + self.filepath \ - + "\nFile may be damaged or incomplete.\n"\ + "\nXML Parse error in " + + self.filepath + + "\nFile may be damaged or incomplete.\n" + "Try erasing this file and downloading again." ) try: - self.raw_irs_dict['Return'] + self.raw_irs_dict["Return"] except KeyError: raise InvalidXMLException( - "'Return' element not located in" + self.filepath \ - + "\nFile may be damaged or incomplete.\n" \ + "'Return' element not located in" + + self.filepath + + "\nFile may be damaged or incomplete.\n" + "Try erasing this file and downloading again." 
) - - def _set_dict_from_json(self): self.raw_irs_dict = self.json def _set_version(self): - self.version_string = self.raw_irs_dict['Return']['@returnVersion'] + self.version_string = self.raw_irs_dict["Return"]["@returnVersion"] def _set_ein(self): - self.ein = self.raw_irs_dict['Return']['ReturnHeader']['Filer']['EIN'] + self.ein = self.raw_irs_dict["Return"]["ReturnHeader"]["Filer"]["EIN"] def _set_schedules(self): - """ Attach the known and unknown schedules """ - self.schedules = ['ReturnHeader990x', ] + """Attach the known and unknown schedules""" + self.schedules = [ + "ReturnHeader990x", + ] self.otherforms = [] - for sked in self.raw_irs_dict['Return']['ReturnData'].keys(): + for sked in self.raw_irs_dict["Return"]["ReturnData"].keys(): if not sked.startswith("@"): if sked in KNOWN_SCHEDULES: self.schedules.append(sked) @@ -146,10 +159,10 @@ def get_object_id(self): return self.object_id def get_schedule(self, skedname): - if skedname == 'ReturnHeader990x': - return self.raw_irs_dict['Return']['ReturnHeader'] + if skedname == "ReturnHeader990x": + return self.raw_irs_dict["Return"]["ReturnHeader"] elif skedname in self.schedules: - return self.raw_irs_dict['Return']['ReturnData'][skedname] + return self.raw_irs_dict["Return"]["ReturnData"][skedname] else: return None @@ -158,7 +171,7 @@ def get_ein(self): def get_otherform(self, skedname): if skedname in self.otherforms: - return self.raw_irs_dict['Return']['ReturnData'][skedname] + return self.raw_irs_dict["Return"]["ReturnData"][skedname] else: return None @@ -182,7 +195,7 @@ def get_result(self): def set_csv_result(self, csv_result): self.csv_result = csv_result - + def get_csv_result(self): return self.csv_result @@ -191,29 +204,29 @@ def set_keyerrors(self, keyerrorlist): def get_keyerrors(self): return self.keyerrors - + def get_unparsed_json(self): - """ Json dicts are unordered """ + """Json dicts are unordered""" return json.dumps(self.raw_irs_dict) def get_type(self): - if 'IRS990' in 
self.schedules: - return 'IRS990' - elif 'IRS990EZ' in self.schedules: - return 'IRS990EZ' - elif 'IRS990PF' in self.schedules: - return 'IRS990PF' + if "IRS990" in self.schedules: + return "IRS990" + elif "IRS990EZ" in self.schedules: + return "IRS990EZ" + elif "IRS990PF" in self.schedules: + return "IRS990PF" else: raise Exception("Missing 990/990EZ/990PF-is this filing valid?") def get_parsed_sked(self, skedname): - """ Returns an array because multiple sked K's are allowed""" + """Returns an array because multiple sked K's are allowed""" if not self.processed: raise Exception("Filing must be processed to return parsed sked") if skedname in self.schedules: matching_skeds = [] for sked in self.result: - if sked['schedule_name']==skedname: + if sked["schedule_name"] == skedname: matching_skeds.append(sked) return matching_skeds else: @@ -222,11 +235,11 @@ def get_parsed_sked(self, skedname): def process(self, verbose=False): # don't reprocess inadvertently if not self.processed: - self.processed=True + self.processed = True if self.json: self._set_dict_from_json() else: - + self._download(verbose=verbose) self._set_dict_from_xml() diff --git a/irs_reader/flatten_utils.py b/irs_reader/flatten_utils.py index aa73f1d..b2afe6b 100644 --- a/irs_reader/flatten_utils.py +++ b/irs_reader/flatten_utils.py @@ -1,8 +1,9 @@ import collections + # Mostly from: http://stackoverflow.com/a/6027615 -def flatten(d, parent_key='', sep='/'): +def flatten(d, parent_key="", sep="/"): items = [] if d: for k, v in d.items(): @@ -10,7 +11,7 @@ def flatten(d, parent_key='', sep='/'): if isinstance(v, collections.abc.MutableMapping): items.extend(flatten(v, new_key, sep=sep).items()) else: - new_key = new_key.replace("/#text","") + new_key = new_key.replace("/#text", "") items.append((new_key, v)) return dict(items) else: diff --git a/irs_reader/irsx_cli.py b/irs_reader/irsx_cli.py index 14d0a1d..6b7a57c 100644 --- a/irs_reader/irsx_cli.py +++ b/irs_reader/irsx_cli.py @@ -1,70 +1,58 @@ 
import argparse from .filing import Filing -from .settings import KNOWN_SCHEDULES, IRS_READER_ROOT -from .xmlrunner import XMLRunner +from .settings import IRS_READER_ROOT, KNOWN_SCHEDULES from .text_format_utils import * +from .xmlrunner import XMLRunner def get_parser(): parser = argparse.ArgumentParser("irsx") parser.add_argument( - 'object_ids', - metavar='object_ids', - type=int, - nargs='+', - help='object ids' + "object_ids", metavar="object_ids", type=int, nargs="+", help="object ids" ) parser.add_argument( - '--verbose', - dest='verbose', - action='store_const', - const=True, default=False, - help='Verbose output' + "--verbose", + dest="verbose", + action="store_const", + const=True, + default=False, + help="Verbose output", ) parser.add_argument( "--schedule", choices=KNOWN_SCHEDULES, default=None, - help='Get only that schedule' + help="Get only that schedule", ) parser.add_argument( "--xpath", - dest='documentation', - action='store_const', - const=True, default=False, - help='show xpath in text format' - ) - parser.add_argument( - "--format", - choices=['json', 'csv', 'txt'], - default='json', - help='Output format' + dest="documentation", + action="store_const", + const=True, + default=False, + help="show xpath in text format", ) parser.add_argument( - "--file", - default=None, - help='Write result to file' + "--format", choices=["json", "csv", "txt"], default="json", help="Output format" ) + parser.add_argument("--file", default=None, help="Write result to file") parser.add_argument( - '--list_schedules', - dest='list_schedules', - action='store_const', + "--list_schedules", + dest="list_schedules", + action="store_const", const=True, default=False, - help='Only list schedules' + help="Only list schedules", ) return parser def run_main(args_read): - csv_format = args_read.format == 'csv' or args_read.format == 'txt' - xml_runner = XMLRunner( - documentation=args_read.documentation, - csv_format=csv_format - ) + csv_format = args_read.format == "csv" 
or args_read.format == "txt" + xml_runner = XMLRunner(documentation=args_read.documentation, csv_format=csv_format) # Use the standardizer that was init'ed by XMLRunner standardizer = xml_runner.get_standardizer() @@ -84,35 +72,32 @@ def run_main(args_read): else: if args_read.schedule: parsed_filing = xml_runner.run_sked( - object_id, - args_read.schedule, - verbose=args_read.verbose + object_id, args_read.schedule, verbose=args_read.verbose ) else: parsed_filing = xml_runner.run_filing( - object_id, - verbose=args_read.verbose + object_id, verbose=args_read.verbose ) - if args_read.format == 'json': + if args_read.format == "json": to_json(parsed_filing.get_result(), outfilepath=args_read.file) - elif args_read.format == 'csv': - to_csv( - parsed_filing, - object_id=object_id, - standardizer=standardizer, - documentation=args_read.documentation, - outfilepath=args_read.file - ) + elif args_read.format == "csv": + to_csv( + parsed_filing, + object_id=object_id, + standardizer=standardizer, + documentation=args_read.documentation, + outfilepath=args_read.file, + ) - elif args_read.format == 'txt': - to_txt( - parsed_filing, - standardizer=standardizer, - documentation=args_read.documentation, - outfilepath=args_read.file - ) + elif args_read.format == "txt": + to_txt( + parsed_filing, + standardizer=standardizer, + documentation=args_read.documentation, + outfilepath=args_read.file, + ) def main(args=None): @@ -121,5 +106,6 @@ def main(args=None): run_main(args_read) print("\n") + if __name__ == "__main__": main() diff --git a/irs_reader/irsx_index_cli.py b/irs_reader/irsx_index_cli.py index 89fe0f4..2df69a1 100644 --- a/irs_reader/irsx_index_cli.py +++ b/irs_reader/irsx_index_cli.py @@ -1,11 +1,11 @@ -import sys import argparse +import sys from datetime import date -from .file_utils import get_index_file_URL, get_local_index_path, \ - stream_download + +from .file_utils import get_index_file_URL, get_local_index_path, stream_download this_year = date.today().year 
-INDEXED_YEARS = [str(i) for i in range(2011, this_year+1)] +INDEXED_YEARS = [str(i) for i in range(2011, this_year + 1)] def get_cli_index_parser(): @@ -14,15 +14,16 @@ def get_cli_index_parser(): "--year", choices=INDEXED_YEARS, default=None, - help='Optionally update an index file' + help="Optionally update an index file", ) parser.add_argument( - '--verbose', - dest='verbose', - action='store_const', - const=True, default=False, - help='Verbose output' + "--verbose", + dest="verbose", + action="store_const", + const=True, + default=False, + help="Verbose output", ) return parser diff --git a/irs_reader/irsx_retrieve_cli.py b/irs_reader/irsx_retrieve_cli.py index 3756949..27eba50 100644 --- a/irs_reader/irsx_retrieve_cli.py +++ b/irs_reader/irsx_retrieve_cli.py @@ -1,40 +1,39 @@ -import sys -import os import argparse +import os +import sys from zipfile import ZipFile + from .file_utils import stream_download from .settings import WORKING_DIRECTORY IRS_location = "https://apps.irs.gov/pub/epostcard/990/xml/%s/download990xml_%s" ref_url = "https://www.irs.gov/charities-non-profits/form-990-series-downloads" -# How many files are available per year? +# How many files are available per year? 
# https://www.irs.gov/charities-non-profits/form-990-series-downloads number_of_files = { - '2022':0, - '2021':6, - '2020':8, - '2019':8, - '2018':7, - '2017':7, - '2016':6, - '2015':2 + "2022": 0, + "2021": 6, + "2020": 8, + "2019": 8, + "2018": 7, + "2017": 7, + "2016": 6, + "2015": 2, } + def get_cli_retrieve_parser(): parser = argparse.ArgumentParser("Irsreader") - parser.add_argument( - "year", - nargs='+', - help='4-digit year to retrieve, ' - ) + parser.add_argument("year", nargs="+", help="4-digit year to retrieve, ") parser.add_argument( - '--verbose', - dest='verbose', - action='store_const', - const=True, default=False, - help='Verbose output' + "--verbose", + dest="verbose", + action="store_const", + const=True, + default=False, + help="Verbose output", ) return parser @@ -47,14 +46,15 @@ def download_unzip_erase(remote_url, verbose=False): print("Downloading %s to %s" % (remote_url, local_path)) stream_download(remote_url, local_path, verbose=verbose) - with ZipFile(local_path, 'r') as zipObj: - # Extract all the contents of zip file in different directory - print('Unzipping %s to %s' % (local_path, WORKING_DIRECTORY)) + with ZipFile(local_path, "r") as zipObj: + # Extract all the contents of zip file in different directory + print("Unzipping %s to %s" % (local_path, WORKING_DIRECTORY)) zipObj.extractall(WORKING_DIRECTORY) print("Cleaning up, removing raw file.") os.remove(local_path) + def unload_zipfile_by_year(year, verbose=False): print("Retrieving zipfiles for year %s" % year) if verbose: @@ -68,7 +68,7 @@ def unload_zipfile_by_year(year, verbose=False): file_list.append(location_base + ".zip") if num_files > 0: - for i in range(1, num_files+1): + for i in range(1, num_files + 1): file_list.append(location_base + "_" + str(i) + ".zip") for this_file in file_list: @@ -76,18 +76,22 @@ def unload_zipfile_by_year(year, verbose=False): def run_cli_retrieve_main(args_read): - print(""" + print( + """ Please visit 
https://www.irs.gov/charities-non-profits/form-990-series-downloads To see if any additional files are available. - """) + """ + ) for year in args_read.year: print("Processing %s files for year %s" % (year, number_of_files[year])) unload_zipfile_by_year(year, verbose=args_read.verbose) + def main(args=None): parser = get_cli_retrieve_parser() args = parser.parse_args() run_cli_retrieve_main(args) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/irs_reader/keyerror_utils.py b/irs_reader/keyerror_utils.py index 6d7ac2e..6e419c5 100644 --- a/irs_reader/keyerror_utils.py +++ b/irs_reader/keyerror_utils.py @@ -1,16 +1,61 @@ - -ignorable_keyerrors = ['/ReturnHeader/BuildTS'] +ignorable_keyerrors = ["/ReturnHeader/BuildTS"] ## Todo: put in 2013 / 2015 series canonicals. # 2013 vars that no longer exist -discontinued_2013_vars = [ '/IRS990ScheduleA/CertificationInd', '/IRS990ScheduleA/Contribution35ControlledInd', '/IRS990ScheduleA/ContributionControllerInd', '/IRS990ScheduleA/ContributionFamilyInd', '/IRS990ScheduleA/Form990ScheduleAPartIVGrp/ExplanationTxt', '/IRS990ScheduleA/SupportedOrgInformationGrp/SupportedOrgNotifiedInd', '/IRS990ScheduleA/SupportedOrgInformationGrp/USOrganizedInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/AdoptBudgetInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/AdoptImplementationStrategyInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/AllNeedsAddressedInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/AttachedToInvoiceInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/AvailableOnRequestInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/BodyAttachmentsInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/DevelopCommunityWidePlanInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/ExecCommunityWidePlanInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/ExecImplementationStrategyInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/FPGUsedDeterEligFreeCareInd', 
'/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/FPGUsedDetermEligDscntCareInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/IncludeOperationalPlanInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/LawsuitInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/LiensOnResidencesInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/MedicaidMedicareInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/OtherNeedsAddressedInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/PermitBodyAttachmentsInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/PermitLawsuitInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/PermitLienOnResidenceInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/PostedInAdmissionOfficeInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/PostedInEmergencyRoomInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/PrioritizeHealthNeedsInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/PrioritizeServicesInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/ProvidedOnAdmissionInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/StateRegulationInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/UninsuredDiscountInd'] +discontinued_2013_vars = [ + "/IRS990ScheduleA/CertificationInd", + "/IRS990ScheduleA/Contribution35ControlledInd", + "/IRS990ScheduleA/ContributionControllerInd", + "/IRS990ScheduleA/ContributionFamilyInd", + "/IRS990ScheduleA/Form990ScheduleAPartIVGrp/ExplanationTxt", + "/IRS990ScheduleA/SupportedOrgInformationGrp/SupportedOrgNotifiedInd", + "/IRS990ScheduleA/SupportedOrgInformationGrp/USOrganizedInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/AdoptBudgetInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/AdoptImplementationStrategyInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/AllNeedsAddressedInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/AttachedToInvoiceInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/AvailableOnRequestInd", + 
"/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/BodyAttachmentsInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/DevelopCommunityWidePlanInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/ExecCommunityWidePlanInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/ExecImplementationStrategyInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/FPGUsedDeterEligFreeCareInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/FPGUsedDetermEligDscntCareInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/IncludeOperationalPlanInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/LawsuitInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/LiensOnResidencesInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/MedicaidMedicareInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/OtherNeedsAddressedInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/PermitBodyAttachmentsInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/PermitLawsuitInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/PermitLienOnResidenceInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/PostedInAdmissionOfficeInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/PostedInEmergencyRoomInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/PrioritizeHealthNeedsInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/PrioritizeServicesInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/ProvidedOnAdmissionInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/StateRegulationInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/UninsuredDiscountInd", +] # 2015 skedh vars removed -discontinued_2015_vars = ['/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/AverageNegotiatedRatesInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/DocumentedEligDeterminationInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/FAPNoticeDisplayedInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/FAPNotifiedAllPatientsInd', 
'/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/FAPNotifiedBeforeDischargeInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/FAPNotifiedUponAdmissionInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/InformationGapsInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/LowestNegotiatedRatesInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/MedicareRatesInd', '/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/OtherMethodUsedInd'] +discontinued_2015_vars = [ + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/AverageNegotiatedRatesInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/DocumentedEligDeterminationInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/FAPNoticeDisplayedInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/FAPNotifiedAllPatientsInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/FAPNotifiedBeforeDischargeInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/FAPNotifiedUponAdmissionInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/InformationGapsInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/LowestNegotiatedRatesInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/MedicareRatesInd", + "/IRS990ScheduleH/HospitalFcltyPoliciesPrctcGrp/OtherMethodUsedInd", +] ignorable = {} for key in ignorable_keyerrors + discontinued_2013_vars + discontinued_2015_vars: ignorable[key] = 1 + def ignorable_keyerror(xpath): try: ignorable[xpath] diff --git a/irs_reader/local_settings-example.py b/irs_reader/local_settings-example.py index 5201003..c3d02bb 100644 --- a/irs_reader/local_settings-example.py +++ b/irs_reader/local_settings-example.py @@ -1,6 +1,6 @@ import os -from .dir_utils import mkdir_p +from .dir_utils import mkdir_p IRS_READER_ROOT = "/path/to/irsreader/990-xml-reader" @@ -8,9 +8,9 @@ IRS_XML_HTTP_BASE = "https://s3.amazonaws.com/irs-form-990" # The directory we put files in while we're processing them -WORKING_DIRECTORY = (os.path.join(IRS_READER_ROOT, "XML") ) +WORKING_DIRECTORY = os.path.join(IRS_READER_ROOT, 
"XML") # Helpful to keep these around for lookup purposes -INDEX_DIRECTORY = (os.path.join(IRS_READER_ROOT, "CSV") ) +INDEX_DIRECTORY = os.path.join(IRS_READER_ROOT, "CSV") -mkdir_p([WORKING_DIRECTORY, INDEX_DIRECTORY]) \ No newline at end of file +mkdir_p([WORKING_DIRECTORY, INDEX_DIRECTORY]) diff --git a/irs_reader/log_utils.py b/irs_reader/log_utils.py index c65584d..e6ebb0c 100644 --- a/irs_reader/log_utils.py +++ b/irs_reader/log_utils.py @@ -1,5 +1,6 @@ import logging -from .settings import LOG_KEY, KEYERROR_LOG + +from .settings import KEYERROR_LOG, LOG_KEY def configure_logging(name=LOG_KEY): @@ -7,7 +8,8 @@ def configure_logging(name=LOG_KEY): logger.setLevel(logging.INFO) # Format formatter = logging.Formatter( - "%(asctime)s - %(name)s - %(levelname)s - %(message)s") + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) # Setup console logging ch = logging.StreamHandler() diff --git a/irs_reader/object_ids.py b/irs_reader/object_ids.py index 556be07..0027928 100644 --- a/irs_reader/object_ids.py +++ b/irs_reader/object_ids.py @@ -1,12 +1,3012 @@ # poor person's sampling--just grab the first 1000 rows per year to test. 
# csvcut -c 9 index_2017.csv | head -n 1000 -object_ids_2017 = ['201612439349300006', '201612439349300026', '201612439349300341', '201612439349300516', '201612439349300546', '201612439349300601', '201612439349300621', '201612439349300746', '201612439349300861', '201612449349100601', '201612449349100706', '201612449349200001', '201612449349200101', '201612449349200111', '201612449349200121', '201612449349200756', '201612449349200761', '201612449349200806', '201612449349200906', '201612449349300216', '201612449349300336', '201612449349300406', '201612449349300606', '201612449349300636', '201612449349300756', '201612449349300786', '201612449349301061', '201612449349301071', '201612449349301076', '201612459349100321', '201612459349200111', '201612459349200206', '201612459349200321', '201612459349200431', '201612459349200506', '201612459349200711', '201612459349300241', '201612459349300301', '201632519349300958', '201632469349300108', '201632519349300313', '201642509349300319', '201642509349300704', '201642509349300209', '201642469349300244', '201632529349300833', '201632519349301153', '201642519349300434', '201602379349100415', '201602379349100605', '201602379349300230', '201602379349300530', '201602389349100605', '201602389349300545', '201602389349300845', '201602399349200620', '201602399349200715', '201602399349300310', '201602399349300330', '201602399349300410', '201602399349300605', '201602399349300610', '201602399349300625', '201602399349300700', '201602399349300725', '201602399349300800', '201602399349300900', '201602399349300915', '201602409349200000', '201602409349200200', '201602409349200700', '201612459349300346', '201612459349300426', '201612459349300431', '201612459349300536', '201612459349300626', '201612459349300816', '201612459349300971', '201612459349300976', '201612459349301101', '201612459349301111', '201612469349200411', '201612469349300206', '201612469349300231', '201612469349300426', '201612469349300501', '201612469349300601', '201612509349300101', 
'201612509349300166', '201612509349300211', '201612509349300356', '201612509349300476', '201612509349300501', '201612509349300621', '201612519349200021', '201612519349200611', '201612519349200736', '201612519349300136', '201612519349300141', '201612519349300146', '201612519349300206', '201612519349300601', '201612519349300766', '201612519349300826', '201612519349300901', '201612519349300951', '201612529349200701', '201612529349300801', '201612529349300836', '201612529349301251', '201622449349300427', '201602429349200000', '201602429349200100', '201602429349200310', '201602429349200615', '201602429349200640', '201602439349300045', '201602439349300415', '201602439349300535', '201602449349300910', '201602449349301055', '201602449349301090', '201602449349301405', '201602459349300345', '201602459349300405', '201602459349300715', '201602459349301055', '201602509349300350', '201602529349200405', '201602529349301100', '201602529349301155', '201602589349100020', '201602599349100130', '201612539349100751', '201602589349100610', '201602589349100210', '201622569349100422', '201622579349100302', '201602599349100000', '201602589349100310', '201602589349100515', '201602599349100405', '201602589349100120', '201622569349100452', '201622579349100117', '201602589349100300', '201602589349100205', '201602599349100630', '201602599349100310', '201622569349100312', '201602579349100515', '201602589349100105', '201602579349100705', '201602569349100600', '201622579349100017', '201602149349301245', '201602159349300625', '201642089349300449', '201642049349300519', '201632099349301278', '201632049349300243', '201612099349301021', '201641979349301154', '201641599349300219', '201631619349300243', '201612109349301306', '201642029349300864', '201602149349300040', '201602049349300800', '201632089349300823', '201622109349300627', '201602079349301200', '201632099349301353', '201632079349301368', '201632039349300953', '201602159349300225', '201602599349100435', '201612539349100406', 
'201602599349100030', '201602599349100640', '201602599349100430', '201602599349100625', '201602599349100205', '201602579349100200', '201602579349100115', '201602579349100105', '201602569349100085', '201622539349100702', '201622569349100122', '201612549349100401', '201612539349100001', '201612539349100311', '201622599349100022', '201602589349100625', '201602579349100715', '201602579349100100', '201602579349100005', '201602539349200810', '201602549349200005', '201602539349200000', '201602539349200510', '201602539349200800', '201602539349200230', '201602539349200805', '201602549349200600', '201602539349200310', '201622579349200247', '201602539349200010', '201602539349200135', '201602539349200605', '201602539349200005', '201602549349200105', '201602539349200600', '201602549349200100', '201602539349200525', '201602539349200300', '201602539349200520', '201602539349200235', '201602539349200710', '201602539349200115', '201602539349200515', '201602599349100215', '201612569349100466', '201602589349100400', '201612599349100301', '201612599349100116', '201602589349100810', '201612569349100501', '201602589349100005', '201622539349100802', '201612569349100401', '201602599349100515', '201602599349100210', '201602589349100805', '201602589349100000', '201612589349100511', '201622549349100002', '201622539349100312', '201612599349100136', '201612579349100706', '201612589349100421', '201602579349100000', '201602579349100210', '201612589349100411', '201612589349100406', '201612589349100326', '201602549349200700', '201602539349200110', '201602539349200225', '201612599349100106', '201612539349100616', '201602599349100530', '201622579349100702', '201622539349100502', '201622579349100412', '201612549349100501', '201612539349100401', '201602589349100750', '201602599349100135', '201602589349100510', '201602599349100420', '201602589349100110', '201602599349100535', '201602589349100015', '201622539349100757', '201622549349100302', '201622599349100002', '201622579349100802', 
'201622569349100617', '201622539349100007', '201622569349100512', '201612599349100211', '201612569349100616', '201612569349200726', '201612579349200761', '201612599349200636', '201612569349200621', '201612599349200606', '201612599349200131', '201612589349200501', '201622589349200342', '201622589349200132', '201612569349200136', '201612599349200121', '201612569349200011', '201612569349200751', '201612569349200001', '201612579349200641', '201612599349200711', '201612599349201111', '201612569349200616', '201612599349200611', '201612599349200541', '201612589349200441', '201642239349302074', '201632249349301788', '201632199349300108', '201602289349302985', '201602259349301770', '201642239349302474', '201602319349300950', '201642249349303004', '201632249349303053', '201632249349301818', '201642229349301129', '201602259349303350', '201602289349301880', '201632229349302013', '201622249349302207', '201632229349300443', '201642219349301344', '201642229349300884', '201642229349301644', '201602289349303935', '201632239349302363', '201642249349301329', '201602599349100635', '201612539349100301', '201602539349100760', '201612589349100626', '201612569349100606', '201602599349100525', '201602599349100400', '201612569349100506', '201622539349100767', '201612569349100461', '201602549349100400', '201602579349100510', '201602579349100800', '201612539349100306', '201602589349100710', '201602599349100020', '201602599349100010', '201602589349100410', '201612599349100201', '201612599349100006', '201612589349100601', '201622589349100717', '201612579349100511', '201602189349300440', '201642169349301589', '201632289349203713', '201632289349203468', '201632289349203398', '201632289349204603', '201632289349203743', '201632289349203373', '201632289349203233', '201632289349203158', '201632289349202553', '201632289349204143', '201632289349204153', '201632289349204003', '201632289349204313', '201632289349203663', '201632289349203628', '201632289349203563', '201632289349202753', 
'201632289349202633', '201632319349200013', '201632289349204248', '201632289349203993', '201632179349301418', '201612229349301241', '201602239349301585', '201602189349300985', '201602189349300760', '201642169349300104', '201612179349300641', '201622159349300027', '201642159349300334', '201602239349302575', '201602189349300865', '201632169349300528', '201622169349301667', '201622159349300787', '201602229349301885', '201612149349301421', '201642179349301574', '201602169349300300', '201602179349301510', '201622119349301027', '201642149349301339', '201622159349301767', '201612189349300641', '201612249349300436', '201612229349301486', '201612219349301091', '201602249349300445', '201602229349301920', '201602219349300525', '201602239349302375', '201602249349302260', '201622229349301242', '201612229349301306', '201602199349300500', '201622219349301032', '201622189349300897', '201602219349302255', '201602189349300950', '201602249349301185', '201612229349300146', '201612249349301301', '201632289349203813', '201632289349203808', '201632289349203568', '201632289349202108', '201632289349204358', '201632289349203223', '201632289349202923', '201642259349201179', '201642259349200709', '201642259349200424', '201632319349200113', '201632319349200033', '201632299349201003', '201632299349200933', '201632289349204413', '201632289349202683', '201632289349202558', '201632289349201388', '201632289349201243', '201632289349200923', '201632269349200133', '201642259349200549', '201632309349200418', '201632289349204273', '201632289349204258', '201632289349203703', '201632289349202983', '201632289349201408', '201632289349201353', '201632289349200228', '201642289349202399', '201642289349201969', '201642289349201744', '201642319349200329', '201642289349202324', '201642289349201379', '201642289349203804', '201642289349203564', '201642289349202134', '201642289349201074', '201642319349200319', '201642289349203174', '201642289349202924', '201642289349202874', '201642289349201979', 
'201642289349200829', '201642319349200719', '201612229349301736', '201612189349301406', '201622229349300527', '201602239349301695', '201612569349100081', '201622569349100457', '201612579349100001', '201622539349100412', '201612579349100206', '201612599349100311', '201612599349100506', '201612599349100401', '201612589349100206', '201612589349100016', '201622579349100312', '201622579349100402', '201602579349100610', '201612569349100076', '201612569349100006', '201612569349100001', '201622569349100072', '201622559349100002', '201612589349100211', '201612589349100126', '201602599349200910', '201602569349200640', '201602599349200745', '201602539349200405', '201602569349200635', '201602579349200715', '201602569349200860', '201602559349200050', '201602569349200135', '201602569349200405', '201602569349200000', '201602569349200530', '201602569349200240', '201602569349200865', '201602589349200120', '201602589349200520', '201602589349200300', '201602589349201005', '201602589349200725', '201602589349200005', '201602559349200000', '201602569349200125', '201602549349200500', '201642289349203299', '201642289349202499', '201642289349201729', '201632299349200913', '201632279349200808', '201632319349200038', '201632289349202428', '201632289349201613', '201632269349200003', '201632309349200718', '201632299349200903', '201632289349202593', '201642259349201084', '201642259349200719', '201632319349200128', '201632309349200438', '201632289349200938', '201632289349200853', '201632269349200613', '201642259349200334', '201632289349202218', '201632289349201508', '201632289349200208', '201632309349200528', '201632289349202408', '201632289349201643', '201632289349200413', '201632289349204608', '201632289349202943', '201632289349202363', '201632289349202008', '201632289349201988', '201632289349201773', '201632289349201578', '201632289349200003', '201632289349204233', '201632289349203433', '201632289349200433', '201632289349202213', '201632289349200223', '201632289349204103', 
'201632289349203348', '201632289349204238', '201632289349203138', '201632289349201733', '201632289349203003', '201632289349202803', '201632289349204378', '201602599349200945', '201602599349200920', '201602599349200325', '201602309349301000', '201602299349300525', '201632259349301783', '201602289349305800', '201612259349302576', '201602289349306480', '201602289349303740', '201602079349300950', '201612089349301256', '201612109349301301', '201642039349300014', '201632079349301123', '201642019349300124', '201641619349300539', '201642089349301274', '201632099349300403', '201602289349305745', '201612259349303741', '201612259349302571', '201612259349301956', '201602259349301840', '201642229349301139', '201602289349305390', '201632259349301883', '201622569349100052', '201612599349100141', '201612549349100001', '201612589349100021', '201612579349100011', '201612599349100411', '201622569349100132', '201622589349100422', '201622589349100507', '201602599349100510', '201602599349100140', '201602599349100105', '201622569349100302', '201622569349100002', '201622569349100082', '201622569349100077', '201622569349100612', '201612599349100216', '201612599349100121', '201612579349100016', '201632289349200143', '201632269349200603', '201631609349200708', '201642179349200249', '201642179349200439', '201602289349202390', '201602259349201435', '201612049349200001', '201612149349200121', '201612169349200301', '201612219349200731', '201612249349202001', '201602239349200910', '201632289349201078', '201632289349201113', '201632279349200038', '201632289349202003', '201632289349201973', '201632289349201143', '201632289349200028', '201632279349200223', '201632279349200713', '201632269349200233', '201632239349300138', '201622249349302802', '201622249349301907', '201642229349301629', '201632249349301433', '201632229349301808', '201642229349301344', '201602289349301630', '201642249349301799', '201642239349301989', '201642249349302259', '201642249349300714', '201602259349302660', 
'201632219349300933', '201632229349301128', '201622249349303002', '201632229349301373', '201642219349302219', '201632249349302148', '201622229349301197', '201622249349301172', '201612569349100086', '201632289349102063', '201632289349100543', '201632289349100508', '201632289349101548', '201632289349100748', '201632319349100103', '201632289349100003', '201622239349301852', '201632289349303273', '201632289349305258', '201632289349306863', '201632289349303708', '201632289349306598', '201632289349306363', '201632289349302613', '201632289349301978', '201632319349300748', '201632289349305448', '201632299349300208', '201632289349304968', '201632289349306638', '201632289349307128', '201632289349305023', '201632289349304128', '201632289349305693', '201632289349303163', '201632289349303643', '201632289349301418', '201622579349100512', '201612539349100811', '201622589349100512', '201622569349100467', '201622569349100307', '201622589349100407', '201622589349100002', '201622579349100507', '201622569349100117', '201622589349100427', '201602579349100600', '201602579349100400', '201622559349100152', '201622539349100407', '201612539349100851', '201622569349100112', '201622569349100102', '201602599349100425', '201622579349100012', '201622589349100807', '201622569349100552', '201622589349100417', '201622589349100327', '201622589349100207', '201622579349100307', '201612329349100001', '201612329349100501', '201612329349100506', '201612329349100706', '201612329349200006', '201612329349200226', '201612329349200306', '201612329349200526', '201612329349200801', '201612329349300301', '201632289349101028', '201632289349101583', '201632289349102008', '201632289349100418', '201642259349100004', '201632299349100303', '201632309349100303', '201642259349100544', '201632299349100003', '201642259349101114', '201632289349100938', '201632289349100623', '201632289349101358', '201632289349101918', '201632289349101198', '201632289349100913', '201632289349101578', '201632289349100733', 
'201632269349100403', '201642289349101929', '201642289349100929', '201642289349101909', '201642289349101564', '201602329349100705', '201642289349101289', '201642269349100304', '201642289349101274', '201602329349100805', '201602339349100100', '201642299349100714', '201642299349100709', '201642289349100944', '201642289349101879', '201642289349101519', '201642289349100734', '201642289349100414', '201642289349101689', '201642289349100224', '201642289349100349', '201642289349100214', '201642229349301184', '201642249349301549', '201632249349300013', '201642229349300839', '201642229349300909', '201632249349302078', '201632249349300513', '201632249349300503', '201632249349301498', '201642249349302379', '201642249349302039', '201642249349301544', '201642249349300729', '201642249349301614', '201642229349301239', '201642229349301324', '201642229349301054', '201642229349301179', '201642219349301764', '201642219349302074', '201642219349301564', '201602579349300515', '201612579349300236', '201602569349300880', '201602569349300420', '201602579349300825', '201602569349300315', '201602579349300310', '201602579349300315', '201602579349301210', '201602569349300900', '201602569349300785', '201602559349300155', '201602569349300110', '201602559349300350', '201612539349300006', '201612589349301136', '201602579349301265', '201602579349300520', '201602579349300855', '201602579349300400', '201602579349300895', '201602579349300720', '201602579349300885', '201602579349300820', '201602569349301265', '201602569349301000', '201612329349300631', '201612329349300646', '201612329349301101', '201612329349301151', '201612329349301206', '201612339349100201', '201612339349200301', '201612339349200501', '201612349349200251', '201612359349100351', '201612359349100361', '201612359349100411', '201612359349200041', '201612359349200256', '201612359349200501', '201612359349300216', '201612359349300461', '201612359349300501', '201612359349300721', '201612359349300901', '201612359349301006', 
'201612369349100756', '201612369349200111', '201612369349200406', '201612369349200416', '201612369349200531', '201612369349300001', '201612369349300011', '201612369349300201', '201612369349300301', '201612369349300336', '201612369349300346', '201612369349300506', '201612369349300611', '201612369349300726', '201612369349300906', '201612369349301156', '201612379349100001', '201602539349100765', '201602539349100210', '201602539349100805', '201602569349100465', '201602539349100610', '201602539349100100', '201602549349100000', '201602539349100005', '201602539349100200', '201602539349100105', '201602539349100800', '201602539349100615', '201622569349100067', '201622569349100057', '201602539349100205', '201602539349100605', '201602539349100600', '201602539349100110', '201602539349100500', '201602539349100810', '201602539349100415', '201602539349100410', '201602549349100500', '201602539349100405', '201602539349100000', '201602569349100300', '201602569349100415', '201642259349100034', '201632289349101218', '201632289349100243', '201632289349100218', '201632269349100003', '201632289349100838', '201632289349100038', '201632289349100523', '201632289349101158', '201632289349101043', '201632289349100048', '201632289349100408', '201632289349101668', '201632289349101213', '201632289349100333', '201632269349100203', '201632289349102003', '201632289349101658', '201632289349100933', '201632289349101513', '201632289349101018', '201632279349100403', '201642259349101004', '201642259349100509', '201632319349100708', '201612379349100116', '201612379349100411', '201612379349100606', '201612379349200001', '201612379349200231', '201612379349200616', '201612379349200706', '201612379349200711', '201612379349200721', '201612379349300016', '201612379349300036', '201612379349300126', '201612379349300226', '201612379349300301', '201612379349300331', '201612379349300401', '201612379349300606', '201612379349300631', '201612379349300716', '201612379349300766', '201612389349100501', 
'201612389349100506', '201612389349200006', '201612389349200126', '201612389349200206', '201612389349200401', '201612389349200606', '201612389349200726', '201612389349300001', '201612389349300016', '201612389349300031', '201612389349300121', '201612389349300141', '201612389349300206', '201612389349300411', '201612389349300501', '201612389349300541', '201612389349300631', '201642459349300959', '201642459349300939', '201642459349300969', '201642439349300724', '201642449349300534', '201632509349300238', '201632509349300003', '201632509349300023', '201642449349300544', '201642459349300949', '201642459349301054', '201642449349300104', '201642519349300709', '201642509349300384', '201642519349300319', '201642509349300219', '201642469349300624', '201642469349300204', '201632529349301013', '201612389349300816', '201612399349200316', '201612399349200506', '201612399349300011', '201612399349300041', '201612399349300211', '201612399349300426', '201612399349300516', '201612399349300626', '201612399349300726', '201612399349300736', '201612409349200401', '201612419349200301', '201612419349300051', '201612429349200131', '201612429349200146', '201612429349200201', '201612429349200216', '201612429349200511', '201612429349200716', '201612429349200751', '201612429349300111', '201612429349300576', '201612429349300586', '201612429349300591', '201612429349300596', '201612429349300691', '201612429349300721', '201612429349300811', '201612429349300966', '201612429349301051', '201612429349301161', '201612439349100201', '201612439349100306', '201612439349100606', '201612439349200401', '201612439349200516', '201612439349200621', '201612439349200631', '201602519349100005', '201602509349100165', '201602509349100060', '201632529349100603', '201642519349100314', '201642459349100104', '201632519349100508', '201602509349100000', '201602509349100210', '201602469349100505', '201622519349100512', '201612519349100311', '201642459349100659', '201602509349100310', '201642509349100359', 
'201622519349100507', '201622519349100702', '201612509349100356', '201602519349100205', '201602519349100315', '201642459349100654', '201602469349100205', '201642449349100409', '201642469349100709', '201602469349100310', '201602469349100010'] +object_ids_2017 = [ + "201612439349300006", + "201612439349300026", + "201612439349300341", + "201612439349300516", + "201612439349300546", + "201612439349300601", + "201612439349300621", + "201612439349300746", + "201612439349300861", + "201612449349100601", + "201612449349100706", + "201612449349200001", + "201612449349200101", + "201612449349200111", + "201612449349200121", + "201612449349200756", + "201612449349200761", + "201612449349200806", + "201612449349200906", + "201612449349300216", + "201612449349300336", + "201612449349300406", + "201612449349300606", + "201612449349300636", + "201612449349300756", + "201612449349300786", + "201612449349301061", + "201612449349301071", + "201612449349301076", + "201612459349100321", + "201612459349200111", + "201612459349200206", + "201612459349200321", + "201612459349200431", + "201612459349200506", + "201612459349200711", + "201612459349300241", + "201612459349300301", + "201632519349300958", + "201632469349300108", + "201632519349300313", + "201642509349300319", + "201642509349300704", + "201642509349300209", + "201642469349300244", + "201632529349300833", + "201632519349301153", + "201642519349300434", + "201602379349100415", + "201602379349100605", + "201602379349300230", + "201602379349300530", + "201602389349100605", + "201602389349300545", + "201602389349300845", + "201602399349200620", + "201602399349200715", + "201602399349300310", + "201602399349300330", + "201602399349300410", + "201602399349300605", + "201602399349300610", + "201602399349300625", + "201602399349300700", + "201602399349300725", + "201602399349300800", + "201602399349300900", + "201602399349300915", + "201602409349200000", + "201602409349200200", + "201602409349200700", + "201612459349300346", + 
"201612459349300426", + "201612459349300431", + "201612459349300536", + "201612459349300626", + "201612459349300816", + "201612459349300971", + "201612459349300976", + "201612459349301101", + "201612459349301111", + "201612469349200411", + "201612469349300206", + "201612469349300231", + "201612469349300426", + "201612469349300501", + "201612469349300601", + "201612509349300101", + "201612509349300166", + "201612509349300211", + "201612509349300356", + "201612509349300476", + "201612509349300501", + "201612509349300621", + "201612519349200021", + "201612519349200611", + "201612519349200736", + "201612519349300136", + "201612519349300141", + "201612519349300146", + "201612519349300206", + "201612519349300601", + "201612519349300766", + "201612519349300826", + "201612519349300901", + "201612519349300951", + "201612529349200701", + "201612529349300801", + "201612529349300836", + "201612529349301251", + "201622449349300427", + "201602429349200000", + "201602429349200100", + "201602429349200310", + "201602429349200615", + "201602429349200640", + "201602439349300045", + "201602439349300415", + "201602439349300535", + "201602449349300910", + "201602449349301055", + "201602449349301090", + "201602449349301405", + "201602459349300345", + "201602459349300405", + "201602459349300715", + "201602459349301055", + "201602509349300350", + "201602529349200405", + "201602529349301100", + "201602529349301155", + "201602589349100020", + "201602599349100130", + "201612539349100751", + "201602589349100610", + "201602589349100210", + "201622569349100422", + "201622579349100302", + "201602599349100000", + "201602589349100310", + "201602589349100515", + "201602599349100405", + "201602589349100120", + "201622569349100452", + "201622579349100117", + "201602589349100300", + "201602589349100205", + "201602599349100630", + "201602599349100310", + "201622569349100312", + "201602579349100515", + "201602589349100105", + "201602579349100705", + "201602569349100600", + "201622579349100017", + 
"201602149349301245", + "201602159349300625", + "201642089349300449", + "201642049349300519", + "201632099349301278", + "201632049349300243", + "201612099349301021", + "201641979349301154", + "201641599349300219", + "201631619349300243", + "201612109349301306", + "201642029349300864", + "201602149349300040", + "201602049349300800", + "201632089349300823", + "201622109349300627", + "201602079349301200", + "201632099349301353", + "201632079349301368", + "201632039349300953", + "201602159349300225", + "201602599349100435", + "201612539349100406", + "201602599349100030", + "201602599349100640", + "201602599349100430", + "201602599349100625", + "201602599349100205", + "201602579349100200", + "201602579349100115", + "201602579349100105", + "201602569349100085", + "201622539349100702", + "201622569349100122", + "201612549349100401", + "201612539349100001", + "201612539349100311", + "201622599349100022", + "201602589349100625", + "201602579349100715", + "201602579349100100", + "201602579349100005", + "201602539349200810", + "201602549349200005", + "201602539349200000", + "201602539349200510", + "201602539349200800", + "201602539349200230", + "201602539349200805", + "201602549349200600", + "201602539349200310", + "201622579349200247", + "201602539349200010", + "201602539349200135", + "201602539349200605", + "201602539349200005", + "201602549349200105", + "201602539349200600", + "201602549349200100", + "201602539349200525", + "201602539349200300", + "201602539349200520", + "201602539349200235", + "201602539349200710", + "201602539349200115", + "201602539349200515", + "201602599349100215", + "201612569349100466", + "201602589349100400", + "201612599349100301", + "201612599349100116", + "201602589349100810", + "201612569349100501", + "201602589349100005", + "201622539349100802", + "201612569349100401", + "201602599349100515", + "201602599349100210", + "201602589349100805", + "201602589349100000", + "201612589349100511", + "201622549349100002", + "201622539349100312", + 
"201612599349100136", + "201612579349100706", + "201612589349100421", + "201602579349100000", + "201602579349100210", + "201612589349100411", + "201612589349100406", + "201612589349100326", + "201602549349200700", + "201602539349200110", + "201602539349200225", + "201612599349100106", + "201612539349100616", + "201602599349100530", + "201622579349100702", + "201622539349100502", + "201622579349100412", + "201612549349100501", + "201612539349100401", + "201602589349100750", + "201602599349100135", + "201602589349100510", + "201602599349100420", + "201602589349100110", + "201602599349100535", + "201602589349100015", + "201622539349100757", + "201622549349100302", + "201622599349100002", + "201622579349100802", + "201622569349100617", + "201622539349100007", + "201622569349100512", + "201612599349100211", + "201612569349100616", + "201612569349200726", + "201612579349200761", + "201612599349200636", + "201612569349200621", + "201612599349200606", + "201612599349200131", + "201612589349200501", + "201622589349200342", + "201622589349200132", + "201612569349200136", + "201612599349200121", + "201612569349200011", + "201612569349200751", + "201612569349200001", + "201612579349200641", + "201612599349200711", + "201612599349201111", + "201612569349200616", + "201612599349200611", + "201612599349200541", + "201612589349200441", + "201642239349302074", + "201632249349301788", + "201632199349300108", + "201602289349302985", + "201602259349301770", + "201642239349302474", + "201602319349300950", + "201642249349303004", + "201632249349303053", + "201632249349301818", + "201642229349301129", + "201602259349303350", + "201602289349301880", + "201632229349302013", + "201622249349302207", + "201632229349300443", + "201642219349301344", + "201642229349300884", + "201642229349301644", + "201602289349303935", + "201632239349302363", + "201642249349301329", + "201602599349100635", + "201612539349100301", + "201602539349100760", + "201612589349100626", + "201612569349100606", + 
"201602599349100525", + "201602599349100400", + "201612569349100506", + "201622539349100767", + "201612569349100461", + "201602549349100400", + "201602579349100510", + "201602579349100800", + "201612539349100306", + "201602589349100710", + "201602599349100020", + "201602599349100010", + "201602589349100410", + "201612599349100201", + "201612599349100006", + "201612589349100601", + "201622589349100717", + "201612579349100511", + "201602189349300440", + "201642169349301589", + "201632289349203713", + "201632289349203468", + "201632289349203398", + "201632289349204603", + "201632289349203743", + "201632289349203373", + "201632289349203233", + "201632289349203158", + "201632289349202553", + "201632289349204143", + "201632289349204153", + "201632289349204003", + "201632289349204313", + "201632289349203663", + "201632289349203628", + "201632289349203563", + "201632289349202753", + "201632289349202633", + "201632319349200013", + "201632289349204248", + "201632289349203993", + "201632179349301418", + "201612229349301241", + "201602239349301585", + "201602189349300985", + "201602189349300760", + "201642169349300104", + "201612179349300641", + "201622159349300027", + "201642159349300334", + "201602239349302575", + "201602189349300865", + "201632169349300528", + "201622169349301667", + "201622159349300787", + "201602229349301885", + "201612149349301421", + "201642179349301574", + "201602169349300300", + "201602179349301510", + "201622119349301027", + "201642149349301339", + "201622159349301767", + "201612189349300641", + "201612249349300436", + "201612229349301486", + "201612219349301091", + "201602249349300445", + "201602229349301920", + "201602219349300525", + "201602239349302375", + "201602249349302260", + "201622229349301242", + "201612229349301306", + "201602199349300500", + "201622219349301032", + "201622189349300897", + "201602219349302255", + "201602189349300950", + "201602249349301185", + "201612229349300146", + "201612249349301301", + "201632289349203813", + 
"201632289349203808", + "201632289349203568", + "201632289349202108", + "201632289349204358", + "201632289349203223", + "201632289349202923", + "201642259349201179", + "201642259349200709", + "201642259349200424", + "201632319349200113", + "201632319349200033", + "201632299349201003", + "201632299349200933", + "201632289349204413", + "201632289349202683", + "201632289349202558", + "201632289349201388", + "201632289349201243", + "201632289349200923", + "201632269349200133", + "201642259349200549", + "201632309349200418", + "201632289349204273", + "201632289349204258", + "201632289349203703", + "201632289349202983", + "201632289349201408", + "201632289349201353", + "201632289349200228", + "201642289349202399", + "201642289349201969", + "201642289349201744", + "201642319349200329", + "201642289349202324", + "201642289349201379", + "201642289349203804", + "201642289349203564", + "201642289349202134", + "201642289349201074", + "201642319349200319", + "201642289349203174", + "201642289349202924", + "201642289349202874", + "201642289349201979", + "201642289349200829", + "201642319349200719", + "201612229349301736", + "201612189349301406", + "201622229349300527", + "201602239349301695", + "201612569349100081", + "201622569349100457", + "201612579349100001", + "201622539349100412", + "201612579349100206", + "201612599349100311", + "201612599349100506", + "201612599349100401", + "201612589349100206", + "201612589349100016", + "201622579349100312", + "201622579349100402", + "201602579349100610", + "201612569349100076", + "201612569349100006", + "201612569349100001", + "201622569349100072", + "201622559349100002", + "201612589349100211", + "201612589349100126", + "201602599349200910", + "201602569349200640", + "201602599349200745", + "201602539349200405", + "201602569349200635", + "201602579349200715", + "201602569349200860", + "201602559349200050", + "201602569349200135", + "201602569349200405", + "201602569349200000", + "201602569349200530", + "201602569349200240", + 
"201602569349200865", + "201602589349200120", + "201602589349200520", + "201602589349200300", + "201602589349201005", + "201602589349200725", + "201602589349200005", + "201602559349200000", + "201602569349200125", + "201602549349200500", + "201642289349203299", + "201642289349202499", + "201642289349201729", + "201632299349200913", + "201632279349200808", + "201632319349200038", + "201632289349202428", + "201632289349201613", + "201632269349200003", + "201632309349200718", + "201632299349200903", + "201632289349202593", + "201642259349201084", + "201642259349200719", + "201632319349200128", + "201632309349200438", + "201632289349200938", + "201632289349200853", + "201632269349200613", + "201642259349200334", + "201632289349202218", + "201632289349201508", + "201632289349200208", + "201632309349200528", + "201632289349202408", + "201632289349201643", + "201632289349200413", + "201632289349204608", + "201632289349202943", + "201632289349202363", + "201632289349202008", + "201632289349201988", + "201632289349201773", + "201632289349201578", + "201632289349200003", + "201632289349204233", + "201632289349203433", + "201632289349200433", + "201632289349202213", + "201632289349200223", + "201632289349204103", + "201632289349203348", + "201632289349204238", + "201632289349203138", + "201632289349201733", + "201632289349203003", + "201632289349202803", + "201632289349204378", + "201602599349200945", + "201602599349200920", + "201602599349200325", + "201602309349301000", + "201602299349300525", + "201632259349301783", + "201602289349305800", + "201612259349302576", + "201602289349306480", + "201602289349303740", + "201602079349300950", + "201612089349301256", + "201612109349301301", + "201642039349300014", + "201632079349301123", + "201642019349300124", + "201641619349300539", + "201642089349301274", + "201632099349300403", + "201602289349305745", + "201612259349303741", + "201612259349302571", + "201612259349301956", + "201602259349301840", + "201642229349301139", + 
"201602289349305390", + "201632259349301883", + "201622569349100052", + "201612599349100141", + "201612549349100001", + "201612589349100021", + "201612579349100011", + "201612599349100411", + "201622569349100132", + "201622589349100422", + "201622589349100507", + "201602599349100510", + "201602599349100140", + "201602599349100105", + "201622569349100302", + "201622569349100002", + "201622569349100082", + "201622569349100077", + "201622569349100612", + "201612599349100216", + "201612599349100121", + "201612579349100016", + "201632289349200143", + "201632269349200603", + "201631609349200708", + "201642179349200249", + "201642179349200439", + "201602289349202390", + "201602259349201435", + "201612049349200001", + "201612149349200121", + "201612169349200301", + "201612219349200731", + "201612249349202001", + "201602239349200910", + "201632289349201078", + "201632289349201113", + "201632279349200038", + "201632289349202003", + "201632289349201973", + "201632289349201143", + "201632289349200028", + "201632279349200223", + "201632279349200713", + "201632269349200233", + "201632239349300138", + "201622249349302802", + "201622249349301907", + "201642229349301629", + "201632249349301433", + "201632229349301808", + "201642229349301344", + "201602289349301630", + "201642249349301799", + "201642239349301989", + "201642249349302259", + "201642249349300714", + "201602259349302660", + "201632219349300933", + "201632229349301128", + "201622249349303002", + "201632229349301373", + "201642219349302219", + "201632249349302148", + "201622229349301197", + "201622249349301172", + "201612569349100086", + "201632289349102063", + "201632289349100543", + "201632289349100508", + "201632289349101548", + "201632289349100748", + "201632319349100103", + "201632289349100003", + "201622239349301852", + "201632289349303273", + "201632289349305258", + "201632289349306863", + "201632289349303708", + "201632289349306598", + "201632289349306363", + "201632289349302613", + "201632289349301978", + 
"201632319349300748", + "201632289349305448", + "201632299349300208", + "201632289349304968", + "201632289349306638", + "201632289349307128", + "201632289349305023", + "201632289349304128", + "201632289349305693", + "201632289349303163", + "201632289349303643", + "201632289349301418", + "201622579349100512", + "201612539349100811", + "201622589349100512", + "201622569349100467", + "201622569349100307", + "201622589349100407", + "201622589349100002", + "201622579349100507", + "201622569349100117", + "201622589349100427", + "201602579349100600", + "201602579349100400", + "201622559349100152", + "201622539349100407", + "201612539349100851", + "201622569349100112", + "201622569349100102", + "201602599349100425", + "201622579349100012", + "201622589349100807", + "201622569349100552", + "201622589349100417", + "201622589349100327", + "201622589349100207", + "201622579349100307", + "201612329349100001", + "201612329349100501", + "201612329349100506", + "201612329349100706", + "201612329349200006", + "201612329349200226", + "201612329349200306", + "201612329349200526", + "201612329349200801", + "201612329349300301", + "201632289349101028", + "201632289349101583", + "201632289349102008", + "201632289349100418", + "201642259349100004", + "201632299349100303", + "201632309349100303", + "201642259349100544", + "201632299349100003", + "201642259349101114", + "201632289349100938", + "201632289349100623", + "201632289349101358", + "201632289349101918", + "201632289349101198", + "201632289349100913", + "201632289349101578", + "201632289349100733", + "201632269349100403", + "201642289349101929", + "201642289349100929", + "201642289349101909", + "201642289349101564", + "201602329349100705", + "201642289349101289", + "201642269349100304", + "201642289349101274", + "201602329349100805", + "201602339349100100", + "201642299349100714", + "201642299349100709", + "201642289349100944", + "201642289349101879", + "201642289349101519", + "201642289349100734", + "201642289349100414", + 
"201642289349101689", + "201642289349100224", + "201642289349100349", + "201642289349100214", + "201642229349301184", + "201642249349301549", + "201632249349300013", + "201642229349300839", + "201642229349300909", + "201632249349302078", + "201632249349300513", + "201632249349300503", + "201632249349301498", + "201642249349302379", + "201642249349302039", + "201642249349301544", + "201642249349300729", + "201642249349301614", + "201642229349301239", + "201642229349301324", + "201642229349301054", + "201642229349301179", + "201642219349301764", + "201642219349302074", + "201642219349301564", + "201602579349300515", + "201612579349300236", + "201602569349300880", + "201602569349300420", + "201602579349300825", + "201602569349300315", + "201602579349300310", + "201602579349300315", + "201602579349301210", + "201602569349300900", + "201602569349300785", + "201602559349300155", + "201602569349300110", + "201602559349300350", + "201612539349300006", + "201612589349301136", + "201602579349301265", + "201602579349300520", + "201602579349300855", + "201602579349300400", + "201602579349300895", + "201602579349300720", + "201602579349300885", + "201602579349300820", + "201602569349301265", + "201602569349301000", + "201612329349300631", + "201612329349300646", + "201612329349301101", + "201612329349301151", + "201612329349301206", + "201612339349100201", + "201612339349200301", + "201612339349200501", + "201612349349200251", + "201612359349100351", + "201612359349100361", + "201612359349100411", + "201612359349200041", + "201612359349200256", + "201612359349200501", + "201612359349300216", + "201612359349300461", + "201612359349300501", + "201612359349300721", + "201612359349300901", + "201612359349301006", + "201612369349100756", + "201612369349200111", + "201612369349200406", + "201612369349200416", + "201612369349200531", + "201612369349300001", + "201612369349300011", + "201612369349300201", + "201612369349300301", + "201612369349300336", + "201612369349300346", + 
"201612369349300506", + "201612369349300611", + "201612369349300726", + "201612369349300906", + "201612369349301156", + "201612379349100001", + "201602539349100765", + "201602539349100210", + "201602539349100805", + "201602569349100465", + "201602539349100610", + "201602539349100100", + "201602549349100000", + "201602539349100005", + "201602539349100200", + "201602539349100105", + "201602539349100800", + "201602539349100615", + "201622569349100067", + "201622569349100057", + "201602539349100205", + "201602539349100605", + "201602539349100600", + "201602539349100110", + "201602539349100500", + "201602539349100810", + "201602539349100415", + "201602539349100410", + "201602549349100500", + "201602539349100405", + "201602539349100000", + "201602569349100300", + "201602569349100415", + "201642259349100034", + "201632289349101218", + "201632289349100243", + "201632289349100218", + "201632269349100003", + "201632289349100838", + "201632289349100038", + "201632289349100523", + "201632289349101158", + "201632289349101043", + "201632289349100048", + "201632289349100408", + "201632289349101668", + "201632289349101213", + "201632289349100333", + "201632269349100203", + "201632289349102003", + "201632289349101658", + "201632289349100933", + "201632289349101513", + "201632289349101018", + "201632279349100403", + "201642259349101004", + "201642259349100509", + "201632319349100708", + "201612379349100116", + "201612379349100411", + "201612379349100606", + "201612379349200001", + "201612379349200231", + "201612379349200616", + "201612379349200706", + "201612379349200711", + "201612379349200721", + "201612379349300016", + "201612379349300036", + "201612379349300126", + "201612379349300226", + "201612379349300301", + "201612379349300331", + "201612379349300401", + "201612379349300606", + "201612379349300631", + "201612379349300716", + "201612379349300766", + "201612389349100501", + "201612389349100506", + "201612389349200006", + "201612389349200126", + "201612389349200206", + 
"201612389349200401", + "201612389349200606", + "201612389349200726", + "201612389349300001", + "201612389349300016", + "201612389349300031", + "201612389349300121", + "201612389349300141", + "201612389349300206", + "201612389349300411", + "201612389349300501", + "201612389349300541", + "201612389349300631", + "201642459349300959", + "201642459349300939", + "201642459349300969", + "201642439349300724", + "201642449349300534", + "201632509349300238", + "201632509349300003", + "201632509349300023", + "201642449349300544", + "201642459349300949", + "201642459349301054", + "201642449349300104", + "201642519349300709", + "201642509349300384", + "201642519349300319", + "201642509349300219", + "201642469349300624", + "201642469349300204", + "201632529349301013", + "201612389349300816", + "201612399349200316", + "201612399349200506", + "201612399349300011", + "201612399349300041", + "201612399349300211", + "201612399349300426", + "201612399349300516", + "201612399349300626", + "201612399349300726", + "201612399349300736", + "201612409349200401", + "201612419349200301", + "201612419349300051", + "201612429349200131", + "201612429349200146", + "201612429349200201", + "201612429349200216", + "201612429349200511", + "201612429349200716", + "201612429349200751", + "201612429349300111", + "201612429349300576", + "201612429349300586", + "201612429349300591", + "201612429349300596", + "201612429349300691", + "201612429349300721", + "201612429349300811", + "201612429349300966", + "201612429349301051", + "201612429349301161", + "201612439349100201", + "201612439349100306", + "201612439349100606", + "201612439349200401", + "201612439349200516", + "201612439349200621", + "201612439349200631", + "201602519349100005", + "201602509349100165", + "201602509349100060", + "201632529349100603", + "201642519349100314", + "201642459349100104", + "201632519349100508", + "201602509349100000", + "201602509349100210", + "201602469349100505", + "201622519349100512", + "201612519349100311", + 
"201642459349100659", + "201602509349100310", + "201642509349100359", + "201622519349100507", + "201622519349100702", + "201612509349100356", + "201602519349100205", + "201602519349100315", + "201642459349100654", + "201602469349100205", + "201642449349100409", + "201642469349100709", + "201602469349100310", + "201602469349100010", +] -#csvcut -c 9 index_2016.csv | head -n 1000 > returns_2016.txt +# csvcut -c 9 index_2016.csv | head -n 1000 > returns_2016.txt -object_ids_2016 = ['201543159349100344', '201543109349200219', '201513089349200226', '201513089349200236', '201523229349300327', '201543089349301829', '201533179349306298', '201533179349201108', '201533179349203783', '201533209349304768', '201533179349307343', '201533209349204083', '201533209349204123', '201533209349204128', '201533209349204148', '201533209349204153', '201533209349204178', '201533209349204198', '201533209349204208', '201533209349204223', '201533209349204228', '201533189349300608', '201523069349301367', '201533069349300963', '201523099349300542', '201533099349301033', '201533099349301043', '201523169349304367', '201533099349301803', '201523069349300142', '201533109349300348', '201503069349100380', '201513089349100601', '201523039349200407', '201543039349301204', '201523039349200632', '201523039349200637', '201523089349301462', '201533069349300788', '201533079349300238', '201543149349201279', '201543159349100504', '201543169349201334', '201543169349201349', '201543109349200229', '201533169349100748', '201533169349100808', '201513069349200601', '201523209349314227', '201523209349314257', '201523209349311332', '201533179349302173', '201533179349307048', '201523219349200632', '201533179349201623', '201533179349201643', '201543109349100104', '201533209349302633', '201533179349200538', '201533179349200618', '201533179349203683', '201533179349203728', '201533209349306188', '201533209349204843', '201533099349301103', '201533099349301113', '201523039349300127', '201523079349301652', 
'201533039349300813', '201533139349300148', '201533139349300208', '201533069349301413', '201533079349300003', '201523039349200827', '201523079349200027', '201523079349200237', '201523069349300957', '201523079349301387', '201533079349200823', '201523209349310937', '201523209349310947', '201543089349201054', '201533179349306528', '201533179349303278', '201543079349200609', '201543079349200529', '201533179349306278', '201523099349201102', '201523239349300002', '201533209349205278', '201533209349205353', '201533209349201488', '201533209349203893', '201533209349203908', '201533209349203913', '201533209349203923', '201533209349201753', '201533209349201808', '201533209349302303', '201533179349307818', '201533179349307828', '201523209349311892', '201533179349309453', '201533209349301728', '201533209349301738', '201533189349100703', '201533209349102838', '201533209349102858', '201533209349101368', '201533209349101373', '201533179349307538', '201533209349203328', '201533209349203503', '201533209349306423', '201533209349306438', '201533209349203508', '201533209349203518', '201533179349307838', '201533189349300223', '201533179349309083', '201533189349300233', '201543099349200889', '201503099349201105', '201513079349201106', '201513089349200936', '201513089349100726', '201523069349301172', '201533069349300408', '201543169349201379', '201523209349314732', '201523209349313972', '201533179349202718', '201533179349202668', '201533179349202733', '201533099349200108', '201523209349311802', '201533209349205593', '201533209349101563', '201533179349308913', '201533209349102818', '201533179349309088', '201533189349300443', '201533209349101958', '201533209349206523', '201533179349309208', '201533209349305433', '201533209349305438', '201533209349206623', '201533179349309223', '201533179349309268', '201533209349102023', '201533209349102028', '201533179349309348', '201533199349100413', '201533209349102278', '201543099349301839', '201503069349200845', '201513069349200231', 
'201503069349200970', '201503069349200980', '201513069349200316', '201513099349201151', '201513079349100106', '201513099349100201', '201523069349301047', '201523069349301057', '201523069349301112', '201523069349301117', '201523069349301127', '201523069349301132', '201523069349301157', '201503039349100615', '201523089349300312', '201523089349300317', '201523089349300322', '201533069349300508', '201533069349300708', '201533069349300718', '201533069349300778', '201533069349300803', '201543159349200314', '201543159349200319', '201543089349301374', '201523209349316527', '201533169349101073', '201543069349100569', '201533179349305958', '201523229349300237', '201533179349303308', '201533179349305888', '201533179349305893', '201533179349305903', '201533179349305923', '201533179349306913', '201543079349301154', '201523219349200727', '201523219349200747', '201543109349200224', '201543109349200234', '201543109349200304', '201533169349100833', '201533069349300443', '201533069349300868', '201533179349301803', '201523209349311197', '201523069349100357', '201533179349305948', '201523229349300307', '201523209349314597', '201523209349314607', '201523209349315117', '201523209349312647', '201523209349312657', '201523209349310667', '201523219349301067', '201523209349313657', '201533179349201693', '201543109349100329', '201523219349100207', '201533179349100403', '201533179349100418', '201533179349100508', '201533179349100513', '201523219349200132', '201533179349200718', '201533209349302068', '201533179349100823', '201533179349100938', '201533179349101128', '201533179349101688', '201533179349101693', '201533179349307523', '201533209349306058', '201543069349301474', '201543149349201919', '201543149349202034', '201543089349200014', '201543099349201214', '201543159349100614', '201543159349100629', '201543149349201549', '201513089349201211', '201503089349201250', '201513099349100036', '201513099349100041', '201523069349301752', '201543139349101009', '201523229349300507', 
'201543109349100009', '201533209349302098', '201533209349301008', '201533209349301023', '201533179349203653', '201533179349203853', '201533179349203858', '201533179349308653', '201533179349203888', '201533189349200133', '201533179349102213', '201533179349102453', '201523069349301392', '201533099349302063', '201533099349302068', '201533099349302103', '201523079349300547', '201523079349300607', '201523079349300707', '201523039349301092', '201533099349301028', '201523039349301227', '201533089349300648', '201513089349100511', '201503099349201100', '201543089349100729', '201523229349300122', '201523229349300142', '201533179349303898', '201533179349303908', '201533179349303913', '201523209349310247', '201533179349300928', '201523209349314977', '201533179349302118', '201523209349313352', '201523209349313387', '201523209349313392', '201533179349303713', '201533179349303723', '201533179349302818', '201523229349200117', '201533179349201233', '201523219349200142', '201523219349200147', '201523219349200207', '201523039349300942', '201523039349300977', '201523039349300982', '201533039349300948', '201543039349300044', '201543039349301154', '201533079349300213', '201533089349300538', '201533079349300328', '201533089349200978', '201523079349201057', '201533039349200308', '201533099349301313', '201523079349200447', '201523079349200627', '201533079349301248', '201533079349301328', '201533049349300403', '201543069349201204', '201543169349201579', '201543109349200424', '201523209349311052', '201523239349300612', '201523209349312367', '201543109349100014', '201543109349100124', '201533099349301538', '201523089349301817', '201523089349301927', '201533089349300633', '201523069349200307', '201533069349200328', '201533069349200338', '201533069349200428', '201533069349200433', '201523069349201167', '201523069349201202', '201523069349201257', '201523069349201302', '201533209349302198', '201523239349300107', '201533209349203173', '201523239349300202', '201533209349206183', 
'201533209349206208', '201533179349300543', '201533209349204003', '201533199349200413', '201533199349200648', '201533209349205633', '201533209349205643', '201533209349302503', '201523209349311932', '201533179349308283', '201533179349308293', '201533179349308308', '201533209349202273', '201533209349303908', '201533209349306913', '201533209349103568', '201543159349200914', '201543159349200919', '201543099349200334', '201543099349200624', '201523239349300307', '201533179349303018', '201533179349306963', '201533089349100613', '201533089349100623', '201533179349306108', '201533179349203403', '201533209349301973', '201533179349101908', '201533209349304733', '201533189349200148', '201533189349200213', '201533189349200223', '201533209349201793', '201523049349300007', '201523049349300107', '201533099349301308', '201503099349300215', '201503099349300225', '201533109349300523', '201533089349301458', '201523099349300347', '201543039349200339', '201523069349200962', '201523069349201002', '201543079349301354', '201533209349203958', '201533209349203963', '201533209349203978', '201533209349203983', '201533209349205873', '201533209349103663', '201533209349103678', '201533209349103238', '201533189349100513', '201503069349201110', '201503069349201115', '201503069349201150', '201503069349201165', '201503069349201215', '201503039349200340', '201513069349200956', '201513099349201116', '201523069349301482', '201543169349201394', '201543169349201369', '201543139349100604', '201523229349300047', '201523209349310187', '201523209349310192', '201523209349312807', '201523209349314007', '201533089349100528', '201523209349314152', '201523219349200627', '201523219349200712', '201523219349200732', '201523219349200802', '201523219349200907', '201533179349202178', '201533179349203378', '201533179349203398', '201533209349300603', '201533209349300608', '201533209349300618', '201533209349300623', '201523069349301327', '201523069349300707', '201523089349301977', '201533039349300038', 
'201513089349100326', '201523089349301827', '201533089349200923', '201523079349301177', '201533039349300823', '201533039349300828', '201543159349101159', '201543139349100014', '201543139349100024', '201543139349100029', '201513089349201101', '201513089349201106', '201513089349201116', '201513089349201156', '201513089349201161', '201513069349200331', '201503079349200795', '201503089349100030', '201543089349301464', '201533179349303958', '201533179349303998', '201523209349312622', '201533179349101313', '201533209349201068', '201523099349200307', '201523069349200337', '201523219349201152', '201533179349202388', '201533179349202413', '201533179349202453', '201523209349314792', '201543079349301174', '201543079349301194', '201543079349301214', '201543079349301309', '201533209349205268', '201533209349201613', '201533209349204028', '201533209349204038', '201533209349204053', '201533209349305963', '201533179349308908', '201533199349300033', '201533209349101623', '201533209349101633', '201533209349101643', '201543119349200204', '201543119349200304', '201543159349200519', '201543159349200529', '201543149349201779', '201543159349200609', '201543149349201799', '201543159349200614', '201543159349200704', '201543159349200734', '201543159349200749', '201543149349201439', '201543159349100804', '201543159349100809', '201543099349200234', '201543099349200339', '201503099349200960', '201513039349100006', '201523209349308842', '201523209349308857', '201533179349306558', '201543079349300119', '201533179349302383', '201533179349302388', '201533179349302413', '201533179349302423', '201533179349101993', '201533179349101998', '201533209349304753', '201533179349305163', '201533209349305143', '201533209349305178', '201533189349300433', '201523089349301007', '201523099349301812', '201533099349301168', '201533099349301173', '201523059349300002', '201523059349300007', '201523209349310582', '201523219349300902', '201523219349300927', '201523219349300932', '201533179349303218', 
'201543079349200724', '201533179349303963', '201533179349201488', '201533179349201513', '201523209349315257', '201533209349301013', '201523069349300737', '201523039349300142', '201523089349301962', '201523039349300912', '201523089349300147', '201523099349301322', '201533089349200928', '201523079349301522', '201523079349301562', '201533079349300603', '201533209349304958', '201533209349202433', '201533209349103903', '201533209349307203', '201533079349200308', '201503089349100620', '201533089349301298', '201543149349303884', '201513069349100376', '201543159349302009', '201543169349301754', '201543159349302049', '201533089349300118', '201543089349200419', '201543169349301814', '201543099349200014', '201543099349200239', '201543159349200949', '201543149349201409', '201543149349201429', '201543099349201109', '201543109349200404', '201543139349100044', '201543139349100104', '201543139349100109', '201523209349313967', '201523209349313982', '201523039349301162', '201533039349301308', '201523099349301372', '201533099349300408', '201533099349300433', '201533099349300513', '201533099349300623', '201523099349301587', '201533079349300708', '201533089349301623', '201533079349301718', '201533099349300003', '201533099349300008', '201533089349301943', '201533089349301953', '201523099349200332', '201523099349200402', '201533209349304158', '201533179349202848', '201533179349202858', '201533179349202873', '201533179349202903', '201533209349202243', '201533209349202258', '201533209349202298', '201533209349202313', '201533209349202343', '201513079349200921', '201523209349310107', '201523209349313822', '201533179349304353', '201533179349200728', '201523209349315237', '201533209349200208', '201533209349201173', '201533209349201178', '201533209349302323', '201523209349311942', '201533209349306063', '201533209349307733', '201533179349302083', '201533179349302108', '201533209349309743', '201533209349310963', '201533209349310968', '201533209349310998', '201533209349310378', 
'201533209349308923', '201533209349310413', '201533209349311033', '201533209349308968', '201533209349310418', '201533209349311133', '201533209349314503', '201543179349306014', '201533209349309328', '201533209349314523', '201533209349314548', '201533209349314553', '201533209349310808', '201533209349314683', '201533209349314693', '201533209349310838', '201533209349314718', '201533209349314723', '201503089349200640', '201513099349200901', '201513079349200321', '201503069349200425', '201503069349200430', '201503069349200515', '201503069349200530', '201503069349200755', '201503069349200900', '201503099349200145', '201503099349200210', '201513079349200541', '201513089349200131', '201513089349200206', '201513079349100601', '201523089349300307', '201523209349308707', '201523209349311307', '201523209349314277', '201523069349100212', '201523209349310987', '201543089349201069', '201523209349311582', '201523209349311657', '201523039349100802', '201523079349100007', '201543089349301989', '201533179349201188', '201533209349300923', '201533179349203573', '201533209349306638', '201523099349301782', '201523099349301787', '201523089349300032', '201503069349300805', '201503069349300815', '201523039349301212', '201523099349300047', '201523099349300112', '201533139349300103', '201533089349300748', '201543099349301914', '201503069349201300', '201543089349301469', '201533179349301748', '201533179349301758', '201523209349313117', '201523209349315172', '201523209349315217', '201523209349314172', '201533179349302358', '201533179349302363', '201523239349100202', '201523239349100307', '201523239349100402', '201533079349300148', '201533089349300513', '201543039349300509', '201543039349300519', '201523069349300892', '201533039349200338', '201533039349200343', '201533039349200418', '201543039349300624', '201523069349300217', '201523069349300247', '201523069349300432', '201523099349300237', '201533089349300048', '201533079349200303', '201533079349200403', '201523069349200972', 
'201533209349205148', '201533209349206148', '201533209349201633', '201533209349201643', '201533199349200318', '201533209349201398', '201533209349101543', '201533189349100603', '201533209349306018', '201533199349100708', '201533199349100718', '201533069349300238', '201533179349306763', '201523209349312602', '201523209349312607', '201533179349301013', '201533179349303838', '201543089349301684', '201543089349301719', '201543089349301724', '201543089349301814', '201543089349301909', '201543089349301924', '201543089349301949', '201523239349200117', '201533209349307668', '201533209349307673', '201533179349203813', '201533209349204963', '201533209349204988', '201533179349202458', '201533209349102443', '201533179349309068', '201533209349300428', '201533199349301023', '201533199349301028', '201533179349309583', '201533209349307183', '201533179349306873', '201533069349200213', '201533069349200233', '201543069349201104', '201543159349100639', '201543149349201534', '201543139349100004', '201543139349100034', '201523209349308717', '201543089349200729', '201523209349316562', '201523209349313147', '201533179349302053', '201533169349101053', '201533179349306148', '201543079349300814', '201523209349312297', '201523209349312322', '201523209349312327', '201523209349312337', '201533179349201983', '201523099349301962', '201533089349301428', '201523069349300127', '201523219349201107', '201533179349202373', '201533209349302993', '201533209349103268', '201533209349103288', '201533209349306393', '201533179349309508', '201533199349301408', '201533179349303488', '201533179349305663', '201533179349305693', '201533179349305708', '201533179349306808', '201533069349200343', '201543089349200229', '201543089349200244', '201543089349200249', '201543099349201204', '201543169349201574', '201543139349100419', '201523209349314662', '201523209349314712', '201523209349313887', '201533179349303978', '201523209349312442', '201533179349201388', '201533179349202838', '201523219349200307', 
'201523209349312307', '201523229349100702', '201533209349304743', '201533209349204813', '201523069349300797', '201523069349300812', '201523069349300827', '201503099349300235', '201533089349301583', '201533089349300433', '201533079349301213', '201533079349301223', '201533209349204298', '201533209349100828', '201533209349206463', '201533209349206818', '201533099349200443', '201533099349200518', '201523089349200247', '201523099349200437', '201523089349200507', '201523089349200512', '201523089349200547', '201523039349200832', '201533089349200738', '201533169349100923', '201533179349303518', '201543069349100369', '201543069349100379', '201543069349100414', '201543069349100524', '201543079349200629', '201533179349307778', '201533209349302523', '201533209349307303', '201533209349307333', '201533209349307363', '201523239349300862', '201533209349303283', '201533209349303298', '201533209349303303', '201533209349303773', '201523099349200432', '201523079349300612', '201503089349100510', '201533039349300213', '201513099349300511', '201543159349100124', '201543159349100134', '201543159349100139', '201543149349100034', '201543149349100049', '201543149349303704', '201543149349303804', '201543149349101224', '201543169349301619', '201533159349304133', '201543039349100624', '201543039349100629', '201533099349100038', '201533089349300043', '201543089349200044', '201543089349200104', '201543089349200119', '201543089349200139', '201543069349200314', '201523209349310152', '201523209349313832', '201533179349201673', '201533179349201678', '201533179349201723', '201543109349100404', '201523209349312222', '201523209349312232', '201533209349303123', '201533209349303163', '201533179349307363', '201533209349302293', '201533209349204648', '201533209349205588', '201533209349307383', '201533179349202793', '201533179349202843', '201533179349202853', '201533179349202863', '201533179349308278', '201533179349308328', '201533209349203058', '201533209349102923', '201533209349206783', 
'201533209349303308', '201533209349303358', '201533209349303388', '201533209349303708', '201533209349303418', '201503069349200810', '201503069349200830', '201503069349200990', '201513089349200511', '201513089349200526', '201543079349300739', '201543079349300834', '201523209349313357', '201523209349312217', '201523239349200227', '201523239349200312', '201533179349200708', '201533179349300308', '201533179349300323', '201533179349300333', '201533179349300338', '201533209349302678', '201533179349203833', '201523209349311832', '201533209349103233', '201533209349101553', '201533209349306053', '201503039349300600', '201543159349100039', '201543169349301719', '201523089349100002', '201523089349100017', '201523089349100022', '201523089349100207', '201533099349100613', '201533209349310883', '201533209349310893', '201533209349310923', '201533209349310928', '201543169349201529', '201543149349202024', '201543159349201019', '201543159349200724', '201543109349200209', '201543109349200314', '201543109349200504', '201543109349200434', '201543099349200224', '201543099349200429', '201543099349301959', '201513079349201066', '201513079349201101', '201513039349200211', '201503099349100735', '201533069349300143', '201533069349300618', '201523099349301612', '201543139349100749', '201543139349100754', '201543139349100764', '201523209349308752', '201523209349308762', '201523209349310132', '201523209349308852', '201523209349311232', '201523069349100322', '201523069349100327', '201543079349100204', '201523209349313587', '201533179349305998', '201543099349100009', '201523239349100407', '201523239349100502', '201533179349102208', '201523099349301772', '201533099349301158', '201503099349300130', '201523079349301702', '201523079349301712', '201543069349201114', '201543069349201304', '201543159349200134', '201523209349312642', '201533179349306993', '201533169349100218', '201533179349201218', '201533179349203553', '201523209349312382', '201533209349301128', '201533179349305138', 
'201523099349300012', '201543039349301004', '201533109349300648', '201533039349300848', '201543039349301109', '201523079349301527', '201523079349200032', '201533089349200743', '201543039349300544', '201533069349301108', '201533089349300023', '201533039349200213', '201533079349300818', '201533039349300923', '201523099349200737', '201533079349300903', '201523069349200342', '201533209349202168', '201533209349202178', '201533209349302238', '201533209349201453', '201533209349201548', '201523209349312842', '201533209349201278', '201533209349201418', '201533199349300638', '201533209349306133', '201533209349103323', '201533209349103328', '201543159349200419', '201523209349313022'] +object_ids_2016 = [ + "201543159349100344", + "201543109349200219", + "201513089349200226", + "201513089349200236", + "201523229349300327", + "201543089349301829", + "201533179349306298", + "201533179349201108", + "201533179349203783", + "201533209349304768", + "201533179349307343", + "201533209349204083", + "201533209349204123", + "201533209349204128", + "201533209349204148", + "201533209349204153", + "201533209349204178", + "201533209349204198", + "201533209349204208", + "201533209349204223", + "201533209349204228", + "201533189349300608", + "201523069349301367", + "201533069349300963", + "201523099349300542", + "201533099349301033", + "201533099349301043", + "201523169349304367", + "201533099349301803", + "201523069349300142", + "201533109349300348", + "201503069349100380", + "201513089349100601", + "201523039349200407", + "201543039349301204", + "201523039349200632", + "201523039349200637", + "201523089349301462", + "201533069349300788", + "201533079349300238", + "201543149349201279", + "201543159349100504", + "201543169349201334", + "201543169349201349", + "201543109349200229", + "201533169349100748", + "201533169349100808", + "201513069349200601", + "201523209349314227", + "201523209349314257", + "201523209349311332", + "201533179349302173", + "201533179349307048", + "201523219349200632", 
+ "201533179349201623", + "201533179349201643", + "201543109349100104", + "201533209349302633", + "201533179349200538", + "201533179349200618", + "201533179349203683", + "201533179349203728", + "201533209349306188", + "201533209349204843", + "201533099349301103", + "201533099349301113", + "201523039349300127", + "201523079349301652", + "201533039349300813", + "201533139349300148", + "201533139349300208", + "201533069349301413", + "201533079349300003", + "201523039349200827", + "201523079349200027", + "201523079349200237", + "201523069349300957", + "201523079349301387", + "201533079349200823", + "201523209349310937", + "201523209349310947", + "201543089349201054", + "201533179349306528", + "201533179349303278", + "201543079349200609", + "201543079349200529", + "201533179349306278", + "201523099349201102", + "201523239349300002", + "201533209349205278", + "201533209349205353", + "201533209349201488", + "201533209349203893", + "201533209349203908", + "201533209349203913", + "201533209349203923", + "201533209349201753", + "201533209349201808", + "201533209349302303", + "201533179349307818", + "201533179349307828", + "201523209349311892", + "201533179349309453", + "201533209349301728", + "201533209349301738", + "201533189349100703", + "201533209349102838", + "201533209349102858", + "201533209349101368", + "201533209349101373", + "201533179349307538", + "201533209349203328", + "201533209349203503", + "201533209349306423", + "201533209349306438", + "201533209349203508", + "201533209349203518", + "201533179349307838", + "201533189349300223", + "201533179349309083", + "201533189349300233", + "201543099349200889", + "201503099349201105", + "201513079349201106", + "201513089349200936", + "201513089349100726", + "201523069349301172", + "201533069349300408", + "201543169349201379", + "201523209349314732", + "201523209349313972", + "201533179349202718", + "201533179349202668", + "201533179349202733", + "201533099349200108", + "201523209349311802", + "201533209349205593", + 
"201533209349101563", + "201533179349308913", + "201533209349102818", + "201533179349309088", + "201533189349300443", + "201533209349101958", + "201533209349206523", + "201533179349309208", + "201533209349305433", + "201533209349305438", + "201533209349206623", + "201533179349309223", + "201533179349309268", + "201533209349102023", + "201533209349102028", + "201533179349309348", + "201533199349100413", + "201533209349102278", + "201543099349301839", + "201503069349200845", + "201513069349200231", + "201503069349200970", + "201503069349200980", + "201513069349200316", + "201513099349201151", + "201513079349100106", + "201513099349100201", + "201523069349301047", + "201523069349301057", + "201523069349301112", + "201523069349301117", + "201523069349301127", + "201523069349301132", + "201523069349301157", + "201503039349100615", + "201523089349300312", + "201523089349300317", + "201523089349300322", + "201533069349300508", + "201533069349300708", + "201533069349300718", + "201533069349300778", + "201533069349300803", + "201543159349200314", + "201543159349200319", + "201543089349301374", + "201523209349316527", + "201533169349101073", + "201543069349100569", + "201533179349305958", + "201523229349300237", + "201533179349303308", + "201533179349305888", + "201533179349305893", + "201533179349305903", + "201533179349305923", + "201533179349306913", + "201543079349301154", + "201523219349200727", + "201523219349200747", + "201543109349200224", + "201543109349200234", + "201543109349200304", + "201533169349100833", + "201533069349300443", + "201533069349300868", + "201533179349301803", + "201523209349311197", + "201523069349100357", + "201533179349305948", + "201523229349300307", + "201523209349314597", + "201523209349314607", + "201523209349315117", + "201523209349312647", + "201523209349312657", + "201523209349310667", + "201523219349301067", + "201523209349313657", + "201533179349201693", + "201543109349100329", + "201523219349100207", + "201533179349100403", + 
"201533179349100418", + "201533179349100508", + "201533179349100513", + "201523219349200132", + "201533179349200718", + "201533209349302068", + "201533179349100823", + "201533179349100938", + "201533179349101128", + "201533179349101688", + "201533179349101693", + "201533179349307523", + "201533209349306058", + "201543069349301474", + "201543149349201919", + "201543149349202034", + "201543089349200014", + "201543099349201214", + "201543159349100614", + "201543159349100629", + "201543149349201549", + "201513089349201211", + "201503089349201250", + "201513099349100036", + "201513099349100041", + "201523069349301752", + "201543139349101009", + "201523229349300507", + "201543109349100009", + "201533209349302098", + "201533209349301008", + "201533209349301023", + "201533179349203653", + "201533179349203853", + "201533179349203858", + "201533179349308653", + "201533179349203888", + "201533189349200133", + "201533179349102213", + "201533179349102453", + "201523069349301392", + "201533099349302063", + "201533099349302068", + "201533099349302103", + "201523079349300547", + "201523079349300607", + "201523079349300707", + "201523039349301092", + "201533099349301028", + "201523039349301227", + "201533089349300648", + "201513089349100511", + "201503099349201100", + "201543089349100729", + "201523229349300122", + "201523229349300142", + "201533179349303898", + "201533179349303908", + "201533179349303913", + "201523209349310247", + "201533179349300928", + "201523209349314977", + "201533179349302118", + "201523209349313352", + "201523209349313387", + "201523209349313392", + "201533179349303713", + "201533179349303723", + "201533179349302818", + "201523229349200117", + "201533179349201233", + "201523219349200142", + "201523219349200147", + "201523219349200207", + "201523039349300942", + "201523039349300977", + "201523039349300982", + "201533039349300948", + "201543039349300044", + "201543039349301154", + "201533079349300213", + "201533089349300538", + "201533079349300328", + 
"201533089349200978", + "201523079349201057", + "201533039349200308", + "201533099349301313", + "201523079349200447", + "201523079349200627", + "201533079349301248", + "201533079349301328", + "201533049349300403", + "201543069349201204", + "201543169349201579", + "201543109349200424", + "201523209349311052", + "201523239349300612", + "201523209349312367", + "201543109349100014", + "201543109349100124", + "201533099349301538", + "201523089349301817", + "201523089349301927", + "201533089349300633", + "201523069349200307", + "201533069349200328", + "201533069349200338", + "201533069349200428", + "201533069349200433", + "201523069349201167", + "201523069349201202", + "201523069349201257", + "201523069349201302", + "201533209349302198", + "201523239349300107", + "201533209349203173", + "201523239349300202", + "201533209349206183", + "201533209349206208", + "201533179349300543", + "201533209349204003", + "201533199349200413", + "201533199349200648", + "201533209349205633", + "201533209349205643", + "201533209349302503", + "201523209349311932", + "201533179349308283", + "201533179349308293", + "201533179349308308", + "201533209349202273", + "201533209349303908", + "201533209349306913", + "201533209349103568", + "201543159349200914", + "201543159349200919", + "201543099349200334", + "201543099349200624", + "201523239349300307", + "201533179349303018", + "201533179349306963", + "201533089349100613", + "201533089349100623", + "201533179349306108", + "201533179349203403", + "201533209349301973", + "201533179349101908", + "201533209349304733", + "201533189349200148", + "201533189349200213", + "201533189349200223", + "201533209349201793", + "201523049349300007", + "201523049349300107", + "201533099349301308", + "201503099349300215", + "201503099349300225", + "201533109349300523", + "201533089349301458", + "201523099349300347", + "201543039349200339", + "201523069349200962", + "201523069349201002", + "201543079349301354", + "201533209349203958", + "201533209349203963", + 
"201533209349203978", + "201533209349203983", + "201533209349205873", + "201533209349103663", + "201533209349103678", + "201533209349103238", + "201533189349100513", + "201503069349201110", + "201503069349201115", + "201503069349201150", + "201503069349201165", + "201503069349201215", + "201503039349200340", + "201513069349200956", + "201513099349201116", + "201523069349301482", + "201543169349201394", + "201543169349201369", + "201543139349100604", + "201523229349300047", + "201523209349310187", + "201523209349310192", + "201523209349312807", + "201523209349314007", + "201533089349100528", + "201523209349314152", + "201523219349200627", + "201523219349200712", + "201523219349200732", + "201523219349200802", + "201523219349200907", + "201533179349202178", + "201533179349203378", + "201533179349203398", + "201533209349300603", + "201533209349300608", + "201533209349300618", + "201533209349300623", + "201523069349301327", + "201523069349300707", + "201523089349301977", + "201533039349300038", + "201513089349100326", + "201523089349301827", + "201533089349200923", + "201523079349301177", + "201533039349300823", + "201533039349300828", + "201543159349101159", + "201543139349100014", + "201543139349100024", + "201543139349100029", + "201513089349201101", + "201513089349201106", + "201513089349201116", + "201513089349201156", + "201513089349201161", + "201513069349200331", + "201503079349200795", + "201503089349100030", + "201543089349301464", + "201533179349303958", + "201533179349303998", + "201523209349312622", + "201533179349101313", + "201533209349201068", + "201523099349200307", + "201523069349200337", + "201523219349201152", + "201533179349202388", + "201533179349202413", + "201533179349202453", + "201523209349314792", + "201543079349301174", + "201543079349301194", + "201543079349301214", + "201543079349301309", + "201533209349205268", + "201533209349201613", + "201533209349204028", + "201533209349204038", + "201533209349204053", + "201533209349305963", + 
"201533179349308908", + "201533199349300033", + "201533209349101623", + "201533209349101633", + "201533209349101643", + "201543119349200204", + "201543119349200304", + "201543159349200519", + "201543159349200529", + "201543149349201779", + "201543159349200609", + "201543149349201799", + "201543159349200614", + "201543159349200704", + "201543159349200734", + "201543159349200749", + "201543149349201439", + "201543159349100804", + "201543159349100809", + "201543099349200234", + "201543099349200339", + "201503099349200960", + "201513039349100006", + "201523209349308842", + "201523209349308857", + "201533179349306558", + "201543079349300119", + "201533179349302383", + "201533179349302388", + "201533179349302413", + "201533179349302423", + "201533179349101993", + "201533179349101998", + "201533209349304753", + "201533179349305163", + "201533209349305143", + "201533209349305178", + "201533189349300433", + "201523089349301007", + "201523099349301812", + "201533099349301168", + "201533099349301173", + "201523059349300002", + "201523059349300007", + "201523209349310582", + "201523219349300902", + "201523219349300927", + "201523219349300932", + "201533179349303218", + "201543079349200724", + "201533179349303963", + "201533179349201488", + "201533179349201513", + "201523209349315257", + "201533209349301013", + "201523069349300737", + "201523039349300142", + "201523089349301962", + "201523039349300912", + "201523089349300147", + "201523099349301322", + "201533089349200928", + "201523079349301522", + "201523079349301562", + "201533079349300603", + "201533209349304958", + "201533209349202433", + "201533209349103903", + "201533209349307203", + "201533079349200308", + "201503089349100620", + "201533089349301298", + "201543149349303884", + "201513069349100376", + "201543159349302009", + "201543169349301754", + "201543159349302049", + "201533089349300118", + "201543089349200419", + "201543169349301814", + "201543099349200014", + "201543099349200239", + "201543159349200949", + 
"201543149349201409", + "201543149349201429", + "201543099349201109", + "201543109349200404", + "201543139349100044", + "201543139349100104", + "201543139349100109", + "201523209349313967", + "201523209349313982", + "201523039349301162", + "201533039349301308", + "201523099349301372", + "201533099349300408", + "201533099349300433", + "201533099349300513", + "201533099349300623", + "201523099349301587", + "201533079349300708", + "201533089349301623", + "201533079349301718", + "201533099349300003", + "201533099349300008", + "201533089349301943", + "201533089349301953", + "201523099349200332", + "201523099349200402", + "201533209349304158", + "201533179349202848", + "201533179349202858", + "201533179349202873", + "201533179349202903", + "201533209349202243", + "201533209349202258", + "201533209349202298", + "201533209349202313", + "201533209349202343", + "201513079349200921", + "201523209349310107", + "201523209349313822", + "201533179349304353", + "201533179349200728", + "201523209349315237", + "201533209349200208", + "201533209349201173", + "201533209349201178", + "201533209349302323", + "201523209349311942", + "201533209349306063", + "201533209349307733", + "201533179349302083", + "201533179349302108", + "201533209349309743", + "201533209349310963", + "201533209349310968", + "201533209349310998", + "201533209349310378", + "201533209349308923", + "201533209349310413", + "201533209349311033", + "201533209349308968", + "201533209349310418", + "201533209349311133", + "201533209349314503", + "201543179349306014", + "201533209349309328", + "201533209349314523", + "201533209349314548", + "201533209349314553", + "201533209349310808", + "201533209349314683", + "201533209349314693", + "201533209349310838", + "201533209349314718", + "201533209349314723", + "201503089349200640", + "201513099349200901", + "201513079349200321", + "201503069349200425", + "201503069349200430", + "201503069349200515", + "201503069349200530", + "201503069349200755", + "201503069349200900", + 
"201503099349200145", + "201503099349200210", + "201513079349200541", + "201513089349200131", + "201513089349200206", + "201513079349100601", + "201523089349300307", + "201523209349308707", + "201523209349311307", + "201523209349314277", + "201523069349100212", + "201523209349310987", + "201543089349201069", + "201523209349311582", + "201523209349311657", + "201523039349100802", + "201523079349100007", + "201543089349301989", + "201533179349201188", + "201533209349300923", + "201533179349203573", + "201533209349306638", + "201523099349301782", + "201523099349301787", + "201523089349300032", + "201503069349300805", + "201503069349300815", + "201523039349301212", + "201523099349300047", + "201523099349300112", + "201533139349300103", + "201533089349300748", + "201543099349301914", + "201503069349201300", + "201543089349301469", + "201533179349301748", + "201533179349301758", + "201523209349313117", + "201523209349315172", + "201523209349315217", + "201523209349314172", + "201533179349302358", + "201533179349302363", + "201523239349100202", + "201523239349100307", + "201523239349100402", + "201533079349300148", + "201533089349300513", + "201543039349300509", + "201543039349300519", + "201523069349300892", + "201533039349200338", + "201533039349200343", + "201533039349200418", + "201543039349300624", + "201523069349300217", + "201523069349300247", + "201523069349300432", + "201523099349300237", + "201533089349300048", + "201533079349200303", + "201533079349200403", + "201523069349200972", + "201533209349205148", + "201533209349206148", + "201533209349201633", + "201533209349201643", + "201533199349200318", + "201533209349201398", + "201533209349101543", + "201533189349100603", + "201533209349306018", + "201533199349100708", + "201533199349100718", + "201533069349300238", + "201533179349306763", + "201523209349312602", + "201523209349312607", + "201533179349301013", + "201533179349303838", + "201543089349301684", + "201543089349301719", + "201543089349301724", + 
"201543089349301814", + "201543089349301909", + "201543089349301924", + "201543089349301949", + "201523239349200117", + "201533209349307668", + "201533209349307673", + "201533179349203813", + "201533209349204963", + "201533209349204988", + "201533179349202458", + "201533209349102443", + "201533179349309068", + "201533209349300428", + "201533199349301023", + "201533199349301028", + "201533179349309583", + "201533209349307183", + "201533179349306873", + "201533069349200213", + "201533069349200233", + "201543069349201104", + "201543159349100639", + "201543149349201534", + "201543139349100004", + "201543139349100034", + "201523209349308717", + "201543089349200729", + "201523209349316562", + "201523209349313147", + "201533179349302053", + "201533169349101053", + "201533179349306148", + "201543079349300814", + "201523209349312297", + "201523209349312322", + "201523209349312327", + "201523209349312337", + "201533179349201983", + "201523099349301962", + "201533089349301428", + "201523069349300127", + "201523219349201107", + "201533179349202373", + "201533209349302993", + "201533209349103268", + "201533209349103288", + "201533209349306393", + "201533179349309508", + "201533199349301408", + "201533179349303488", + "201533179349305663", + "201533179349305693", + "201533179349305708", + "201533179349306808", + "201533069349200343", + "201543089349200229", + "201543089349200244", + "201543089349200249", + "201543099349201204", + "201543169349201574", + "201543139349100419", + "201523209349314662", + "201523209349314712", + "201523209349313887", + "201533179349303978", + "201523209349312442", + "201533179349201388", + "201533179349202838", + "201523219349200307", + "201523209349312307", + "201523229349100702", + "201533209349304743", + "201533209349204813", + "201523069349300797", + "201523069349300812", + "201523069349300827", + "201503099349300235", + "201533089349301583", + "201533089349300433", + "201533079349301213", + "201533079349301223", + "201533209349204298", + 
"201533209349100828", + "201533209349206463", + "201533209349206818", + "201533099349200443", + "201533099349200518", + "201523089349200247", + "201523099349200437", + "201523089349200507", + "201523089349200512", + "201523089349200547", + "201523039349200832", + "201533089349200738", + "201533169349100923", + "201533179349303518", + "201543069349100369", + "201543069349100379", + "201543069349100414", + "201543069349100524", + "201543079349200629", + "201533179349307778", + "201533209349302523", + "201533209349307303", + "201533209349307333", + "201533209349307363", + "201523239349300862", + "201533209349303283", + "201533209349303298", + "201533209349303303", + "201533209349303773", + "201523099349200432", + "201523079349300612", + "201503089349100510", + "201533039349300213", + "201513099349300511", + "201543159349100124", + "201543159349100134", + "201543159349100139", + "201543149349100034", + "201543149349100049", + "201543149349303704", + "201543149349303804", + "201543149349101224", + "201543169349301619", + "201533159349304133", + "201543039349100624", + "201543039349100629", + "201533099349100038", + "201533089349300043", + "201543089349200044", + "201543089349200104", + "201543089349200119", + "201543089349200139", + "201543069349200314", + "201523209349310152", + "201523209349313832", + "201533179349201673", + "201533179349201678", + "201533179349201723", + "201543109349100404", + "201523209349312222", + "201523209349312232", + "201533209349303123", + "201533209349303163", + "201533179349307363", + "201533209349302293", + "201533209349204648", + "201533209349205588", + "201533209349307383", + "201533179349202793", + "201533179349202843", + "201533179349202853", + "201533179349202863", + "201533179349308278", + "201533179349308328", + "201533209349203058", + "201533209349102923", + "201533209349206783", + "201533209349303308", + "201533209349303358", + "201533209349303388", + "201533209349303708", + "201533209349303418", + "201503069349200810", + 
"201503069349200830", + "201503069349200990", + "201513089349200511", + "201513089349200526", + "201543079349300739", + "201543079349300834", + "201523209349313357", + "201523209349312217", + "201523239349200227", + "201523239349200312", + "201533179349200708", + "201533179349300308", + "201533179349300323", + "201533179349300333", + "201533179349300338", + "201533209349302678", + "201533179349203833", + "201523209349311832", + "201533209349103233", + "201533209349101553", + "201533209349306053", + "201503039349300600", + "201543159349100039", + "201543169349301719", + "201523089349100002", + "201523089349100017", + "201523089349100022", + "201523089349100207", + "201533099349100613", + "201533209349310883", + "201533209349310893", + "201533209349310923", + "201533209349310928", + "201543169349201529", + "201543149349202024", + "201543159349201019", + "201543159349200724", + "201543109349200209", + "201543109349200314", + "201543109349200504", + "201543109349200434", + "201543099349200224", + "201543099349200429", + "201543099349301959", + "201513079349201066", + "201513079349201101", + "201513039349200211", + "201503099349100735", + "201533069349300143", + "201533069349300618", + "201523099349301612", + "201543139349100749", + "201543139349100754", + "201543139349100764", + "201523209349308752", + "201523209349308762", + "201523209349310132", + "201523209349308852", + "201523209349311232", + "201523069349100322", + "201523069349100327", + "201543079349100204", + "201523209349313587", + "201533179349305998", + "201543099349100009", + "201523239349100407", + "201523239349100502", + "201533179349102208", + "201523099349301772", + "201533099349301158", + "201503099349300130", + "201523079349301702", + "201523079349301712", + "201543069349201114", + "201543069349201304", + "201543159349200134", + "201523209349312642", + "201533179349306993", + "201533169349100218", + "201533179349201218", + "201533179349203553", + "201523209349312382", + "201533209349301128", + 
"201533179349305138", + "201523099349300012", + "201543039349301004", + "201533109349300648", + "201533039349300848", + "201543039349301109", + "201523079349301527", + "201523079349200032", + "201533089349200743", + "201543039349300544", + "201533069349301108", + "201533089349300023", + "201533039349200213", + "201533079349300818", + "201533039349300923", + "201523099349200737", + "201533079349300903", + "201523069349200342", + "201533209349202168", + "201533209349202178", + "201533209349302238", + "201533209349201453", + "201533209349201548", + "201523209349312842", + "201533209349201278", + "201533209349201418", + "201533199349300638", + "201533209349306133", + "201533209349103323", + "201533209349103328", + "201543159349200419", + "201523209349313022", +] # csvcut -c 9 index_2015.csv | head -n 1000 > returns_2015.txt -object_ids_2015 = ['201542399349300614', '201542399349300619', '201542399349300629', '201542399349300634', '201542399349300719', '201542399349300724', '201542399349300739', '201522369349300102', '201522369349300112', '201522369349300117', '201522369349300122', '201522369349300127', '201522369349300132', '201522369349300137', '201522369349300142', '201522369349300147', '201522369349300202', '201522369349300207', '201522369349300212', '201522369349300227', '201522369349300307', '201522369349300317', '201532299349304913', '201532299349304953', '201542379349300864', '201542379349300874', '201542379349300884', '201542379349301004', '201542379349301009', '201532299349302418', '201532299349302423', '201532299349302433', '201532299349302443', '201532299349302473', '201532299349302483', '201532299349302488', '201532299349302498', '201532299349302503', '201532299349302518', '201532299349302523', '201532299349302543', '201532299349302558', '201542399349200309', '201542399349200319', '201542399349200324', '201542399349200334', '201542399349200339', '201542399349200609', '201542399349200614', '201542399349200709', '201542399349200714', '201542399349200814', 
'201542399349200909', '201542399349201004', '201522379349200037', '201522379349200127', '201522379349200202', '201522379349200212', '201522379349200307', '201522379349200312', '201522379349200322', '201522379349200402', '201522379349200612', '201522379349200712', '201522379349200722', '201532369349200018', '201532369349200023', '201532399349201003', '201542399349200019', '201542399349200104', '201542399349200119', '201542399349200129', '201542399349200504', '201542399349200509', '201502649349200005', '201502649349200010', '201502649349200100', '201502649349200105', '201502649349200110', '201502649349200120', '201502649349200125', '201502649349200200', '201502649349200205', '201502649349200210', '201502649349200215', '201502649349200225', '201502649349200250', '201502649349200255', '201502649349200265', '201502649349200300', '201502649349200315', '201502649349200330', '201502649349200355', '201522679349200002', '201522679349200022', '201522679349200032', '201522679349200102', '201502669349301050', '201502669349301070', '201522659349300002', '201522659349300012', '201522659349300042', '201532649349300343', '201532649349300348', '201532649349300433', '201532649349300438', '201532649349300538', '201532649349300603', '201532649349300658', '201532649349300663', '201542649349300034', '201542649349300039', '201542649349300184', '201542649349300234', '201542649349300329', '201542649349300409', '201512659349200121', '201512659349200131', '201512659349200201', '201512659349200211', '201512659349200316', '201512659349200326', '201512659349200501', '201512659349200526', '201512659349200616', '201512659349200621', '201512659349200716', '201522649349200267', '201522649349200357', '201522649349200372', '201522649349200402', '201522649349200407', '201532649349200008', '201532649349200013', '201532649349200103', '201542619349200304', '201542619349200514', '201542619349200809', '201512669349301106', '201522669349300002', '201522669349300017', '201522669349300022', 
'201532659349300538', '201532659349300548', '201532659349300608', '201532659349300628', '201532659349300638', '201532659349300728', '201532659349300753', '201532659349300953', '201532659349301003', '201532659349301053', '201532659349301058', '201542659349300224', '201542659349300234', '201542659349300239', '201542659349300314', '201512649349300026', '201512649349300036', '201532299349302568', '201532299349304203', '201532299349304208', '201532299349304223', '201522369349300517', '201522369349300527', '201522369349300532', '201522369349300537', '201522369349300542', '201522369349300547', '201522369349300602', '201522369349300607', '201522369349300612', '201522369349300617', '201522369349300622', '201522369349300632', '201522369349300637', '201522369349300652', '201522369349300667', '201522369349300687', '201522369349300802', '201532339349300003', '201532339349300008', '201532339349300018', '201542389349300009', '201542389349300014', '201542269349301474', '201542269349301479', '201542269349301489', '201542269349301499', '201542269349301554', '201542269349301564', '201542269349301574', '201542269349301584', '201542269349301589', '201542269349301599', '201542269349301659', '201542269349301664', '201542269349301674', '201542269349301679', '201542269349301694', '201542269349301699', '201542279349300039', '201542399349200819', '201542399349200904', '201542399349200914', '201532299349100643', '201532299349100648', '201532299349100703', '201532299349100713', '201532299349100718', '201532299349100723', '201542269349100129', '201542269349100134', '201542269349100139', '201542269349100144', '201542269349100149', '201542269349100204', '201542269349100209', '201542269349100219', '201542269349100224', '201542269349100234', '201542269349100239', '201542269349100249', '201542269349100304', '201542269349100314', '201542269349100324', '201542269349100329', '201542269349100339', '201502649349300020', '201542669349300224', '201542669349300234', '201542669349300244', 
'201542669349300304', '201542669349300309', '201542669349300324', '201542669349300329', '201542669349300409', '201542669349300414', '201542669349300429', '201542669349300434', '201542669349300444', '201542669349300509', '201542669349300514', '201542669349300524', '201542669349300529', '201542669349300534', '201502379349200000', '201502379349200010', '201502379349200020', '201502379349200040', '201502379349200115', '201502379349200135', '201502379349200305', '201502379349200310', '201502379349200320', '201502379349200620', '201502389349200410', '201502389349200415', '201502389349200520', '201502389349200630', '201502389349200705', '201502389349200730', '201502389349200760', '201512379349200136', '201512379349200316', '201512379349200711', '201502389349200300', '201512379349200016', '201512379349200026', '201512379349200031', '201512379349200041', '201512379349200101', '201512379349200111', '201512379349200116', '201512379349200126', '201512379349200201', '201512379349200211', '201512379349200216', '201512379349200226', '201512379349200301', '201512379349200306', '201512379349200321', '201512379349200401', '201512379349200406', '201512379349200416', '201512379349200601', '201512379349200611', '201512649349300051', '201512649349300151', '201512649349300166', '201512649349300176', '201512649349300196', '201512649349300226', '201512649349300231', '201512649349300301', '201512649349300316', '201512649349300326', '201512649349300341', '201542679349300134', '201542679349300149', '201542679349300214', '201542679349300219', '201542679349300239', '201542679349300244', '201542679349300314', '201542679349300319', '201532319349200118', '201532319349200323', '201542269349201814', '201542269349201874', '201542269349201884', '201542269349201969', '201542269349201979', '201542299349200214', '201542299349200244', '201542299349200329', '201542299349200429', '201542299349200509', '201542299349200524', '201542309349200034', '201542309349200104', '201542309349200134', 
'201542309349200244', '201542309349200404', '201502349349300000', '201502349349300200', '201502349349300700', '201502359349300100', '201502359349300400', '201502369349300000', '201542279349300104', '201542279349300114', '201542279349300119', '201512399349300006', '201512399349300016', '201512399349300021', '201512399349300031', '201512399349300036', '201512399349300116', '201512399349300121', '201512399349300136', '201542339349300119', '201542339349300124', '201542339349300129', '201542339349300134', '201542339349300204', '201542339349300214', '201542339349300234', '201542339349300309', '201542339349300314', '201542339349300334', '201542339349300404', '201512399349300321', '201512399349300336', '201512399349300341', '201512399349300411', '201532369349300503', '201532369349300508', '201532369349300513', '201532369349300518', '201532369349300523', '201542339349300419', '201542339349300504', '201542339349300519', '201542339349300529', '201542339349300609', '201542339349300614', '201542339349300624', '201542339349300804', '201542339349300814', '201542339349300819', '201542349349300204', '201542359349300304', '201502619349300005', '201502619349300010', '201502619349300015', '201502619349300020', '201502619349300025', '201502619349300100', '201502619349300105', '201502619349300110', '201502619349300125', '201502619349300135', '201502619349300200', '201502619349300205', '201502619349300215', '201502619349300225', '201502619349300230', '201542649349300619', '201542649349300624', '201542649349300634', '201542649349300639', '201542649349300654', '201542649349300664', '201542649349300684', '201542649349300754', '201512619349300911', '201512619349301001', '201512619349301006', '201512619349301101', '201512629349300101', '201512679349300716', '201512679349300731', '201512679349300736', '201512679349300746', '201512679349300801', '201512679349300816', '201512679349300826', '201512679349300836', '201512679349300901', '201512679349300906', '201512679349300921', 
'201512679349301011', '201512679349301101', '201502649349300615', '201502649349300620', '201512679349100006', '201512679349100101', '201512679349100106', '201512679349100201', '201512679349100206', '201512679349100306', '201522669349100002', '201522669349100102', '201522669349100202', '201522669349100402', '201522669349100602', '201522669349100702', '201532659349100503', '201532659349100508', '201532659349100603', '201542659349100009', '201542659349100104', '201542659349100204', '201542659349100404', '201542659349100504', '201542659349100604', '201522649349300162', '201522649349300172', '201522649349300177', '201522649349300182', '201522649349300187', '201532619349300708', '201532619349300718', '201532619349300808', '201532619349300813', '201532619349300903', '201532619349300908', '201532619349300918', '201532619349301103', '201532639349300053', '201502369349300005', '201512369349300001', '201512369349300031', '201512369349300111', '201512369349300116', '201512369349300126', '201512369349300221', '201512369349300231', '201512369349300246', '201512369349300336', '201512369349300451', '201512369349300496', '201512369349300521', '201512369349300546', '201512369349300621', '201532319349200423', '201532319349200433', '201532319349200518', '201532319349200543', '201532319349200608', '201532319349200713', '201532319349200833', '201542269349202034', '201542269349202039', '201542269349202059', '201542269349202074', '201542269349202094', '201542299349200609', '201542299349200619', '201542299349200724', '201542299349200814', '201542299349200844', '201542309349200524', '201542309349200534', '201542309349200619', '201532269349200713', '201532269349200738', '201532269349201038', '201532269349201043', '201532269349201053', '201532269349201078', '201532269349201108', '201532339349300508', '201532339349300513', '201532339349300528', '201532339349300533', '201532339349300608', '201532339349300613', '201532339349300623', '201532339349300633', '201532339349300808', 
'201532339349300823', '201532349349300203', '201532349349300303', '201532399349300238', '201532399349300243', '201532399349300308', '201542389349300844', '201542389349300849', '201542389349300914', '201522339349300127', '201522339349300137', '201522339349300202', '201522339349300217', '201522339349300222', '201522339349300227', '201522339349300232', '201522339349300302', '201522339349300307', '201522339349300317', '201522339349300322', '201522339349300327', '201522339349300407', '201522339349300417', '201522339349300427', '201522339349300517', '201522339349300522', '201542369349300244', '201542369349300249', '201542369349300324', '201542369349300404', '201542369349300454', '201532379349300883', '201532379349300903', '201542369349300474', '201502649349300640', '201502649349300650', '201502649349300655', '201502649349300660', '201502649349300670', '201502649349300675', '201502649349300700', '201522679349300612', '201522679349300617', '201522679349300702', '201522679349300712', '201522679349300717', '201532679349300013', '201532679349300023', '201532679349300033', '201532679349300108', '201532679349300113', '201532679349300133', '201502659349300105', '201502659349300110', '201502659349300120', '201502659349300135', '201502659349300140', '201512649349300416', '201512649349300421', '201512649349300431', '201512649349300436', '201512649349300501', '201512649349300511', '201512649349300521', '201512649349300531', '201512649349300541', '201512649349300546', '201512649349300611', '201512649349300631', '201512649349300641', '201502649349300535', '201502649349300610', '201502649349300625', '201502649349300635', '201502649349300665', '201502649349300680', '201502649349300690', '201532639349300203', '201542619349300109', '201542619349300114', '201542619349300119', '201542619349300124', '201542619349300134', '201542619349300204', '201542619349300209', '201542619349300219', '201512669349300326', '201512669349300331', '201512669349300336', '201512669349300411', 
'201512669349300416', '201532299349201658', '201532299349201668', '201532299349201703', '201532299349201708', '201542269349202354', '201542269349202364', '201542299349200919', '201542299349201129', '201542299349201139', '201542309349200804', '201542309349200964', '201542309349200974', '201542309349201104', '201532269349201478', '201532269349201518', '201532269349201543', '201532269349201563', '201532269349201603', '201532269349201633', '201532289349200113', '201532289349200118', '201532289349200313', '201532289349200603', '201532289349200608', '201532299349201993', '201532329349200028', '201532329349200113', '201532329349200303', '201532329349200403', '201532329349200423', '201542299349201424', '201542299349201434', '201542299349201554', '201542299349201569', '201542299349201604', '201532269349201783', '201532269349201793', '201532269349201803', '201532269349201818', '201532299349202373', '201532299349202388', '201532299349202393', '201542369349300484', '201542369349300489', '201542369349300509', '201542369349300524', '201542369349300529', '201542369349300539', '201542369349300544', '201542369349300609', '201542369349300619', '201542369349300624', '201542369349300634', '201542369349300649', '201542369349300654', '201542369349300669', '201542369349300674', '201502379349300400', '201502379349300500', '201502379349300600', '201502379349300610', '201502379349300615', '201502379349300620', '201502379349300700', '201502379349300705', '201502379349300710', '201502379349300720', '201502379349300725', '201502379349300730', '201502379349300740', '201502379349300745', '201502379349300855', '201502379349300860', '201502379349300875', '201502379349300905', '201502379349300910', '201502379349301005', '201542379349300729', '201542379349300734', '201532269349302848', '201532269349302868', '201532269349302918', '201532269349302923', '201532269349302973', '201532269349302993', '201502649349300710', '201502649349300750', '201502649349300800', '201512629349300601', 
'201512649349300006', '201512649349300011', '201522619349300007', '201522619349300012', '201522619349300127', '201522679349300227', '201522679349300232', '201522679349300412', '201522679349300517', '201522679349300722', '201532679349300028', '201532679349300118', '201502679349200720', '201502679349200750', '201522669349200002', '201522669349200017', '201522669349200022', '201522669349200107', '201522669349200117', '201522669349200122', '201522669349200202', '201522669349200217', '201522669349200222', '201542659349200424', '201542659349200504', '201542659349200524', '201542659349200529', '201542659349200614', '201542659349200619', '201542659349200709', '201542659349200714', '201512659349200001', '201512659349200006', '201512659349200011', '201512659349200021', '201512659349200101', '201512659349200111', '201512659349200126', '201512669349300431', '201512669349300441', '201512669349300446', '201512669349300511', '201512669349300531', '201512669349300601', '201512669349300621', '201512669349300626', '201522659349300722', '201522659349300757', '201522659349300762', '201522659349300802', '201522659349300812', '201522659349300907', '201522659349301002', '201522659349301052', '201512659349300041', '201512659349300046', '201512659349300101', '201512659349300121', '201512659349300126', '201512659349300206', '201512659349300216', '201512659349300221', '201512659349300236', '201512659349300241', '201512659349300301', '201512659349300311', '201512659349300326', '201512659349300401', '201512659349300416', '201512659349300431', '201522649349300547', '201522649349300602', '201542639349300204', '201502659349300235', '201502659349300300', '201502659349300330', '201502659349300415', '201502659349300430', '201502659349300545', '201502659349300625', '201512649349300661', '201532299349202433', '201532299349202508', '201532299349202518', '201532299349202533', '201532299349202538', '201532299349202568', '201542279349200014', '201542279349200139', '201542279349200229', 
'201542279349200324', '201542279349200429', '201542299349201924', '201542299349201984', '201542299349202154', '201532299349202613', '201532299349202618', '201532299349202668', '201532299349202678', '201532299349202723', '201532299349202763', '201532299349202778', '201542279349200634', '201542279349200714', '201542279349200724', '201542279349200734', '201542279349200809', '201542299349202209', '201542299349202259', '201542299349202334', '201542299349202359', '201542299349202369', '201542299349202379', '201542299349202394', '201532299349100018', '201532299349100023', '201532299349100028', '201532299349100033', '201532299349100038', '201532299349100043', '201532299349100048', '201532299349100103', '201532299349100108', '201532269349303013', '201532269349303018', '201532299349302438', '201532299349302453', '201532299349302463', '201532299349302468', '201532299349302538', '201532299349302553', '201532299349304308', '201532299349304318', '201532299349304353', '201532299349304423', '201532299349304443', '201532299349304463', '201512399349300731', '201512399349300741', '201522389349300042', '201522389349300047', '201522389349300102', '201522389349300107', '201522389349300202', '201522389349300207', '201522389349300212', '201522389349300242', '201522389349300247', '201522389349300307', '201522389349300312', '201522389349300317', '201522389349300322', '201522389349300327', '201532369349300608', '201532369349300658', '201532369349300673', '201532369349300683', '201532369349300758', '201532369349300803', '201532269349303203', '201532269349303233', '201532299349300113', '201532299349300128', '201532299349302648', '201532299349302703', '201512659349200216', '201522649349200317', '201522649349200322', '201522649349200332', '201522649349200362', '201522649349200367', '201542619349200504', '201542619349200509', '201542619349200604', '201542619349200704', '201542619349200804', '201542619349200904', '201502669349300410', '201502669349300420', '201502669349300425', 
'201502669349300440', '201502669349300445', '201502669349300500', '201502669349300515', '201502669349300525', '201502669349300530', '201502669349300535', '201502669349300600', '201502669349300605', '201502669349300610', '201502669349300615', '201502669349300620', '201502669349300630', '201502669349300645', '201502669349300800', '201512379349300911', '201512379349301006', '201512379349301016', '201512379349301021', '201522669349300952', '201522669349301057', '201522669349301067', '201522669349301102', '201532669349300003', '201532669349300008', '201532669349300023', '201532669349300108', '201532669349300113', '201512649349300681', '201512649349300701', '201522649349300027', '201522649349300152', '201522649349300167', '201532619349300203', '201532619349300213', '201532619349300238', '201532619349300303', '201532619349300418', '201532619349300513', '201532619349300603', '201542679349300919', '201502639349200250', '201512619349200001', '201512619349200011', '201512619349200101', '201512619349200206', '201512619349200211', '201512619349200306', '201512619349200311', '201512619349200316', '201512619349200321', '201512619349200401', '201512619349200411', '201512619349200501', '201512619349200601', '201512619349200606', '201512619349200701', '201512619349200711', '201512619349200801', '201512619349200811', '201532299349100113', '201532299349100118', '201532299349100123', '201532299349100128', '201532299349100133', '201532299349100138', '201532299349100203', '201532299349100208', '201532299349100213', '201532299349100218', '201532299349100223', '201532299349100233', '201532299349100238', '201532299349100248', '201532299349100303', '201542269349100004', '201542269349100014', '201542299349101134', '201542299349101154', '201542299349101159', '201542299349101169', '201542299349101174', '201542299349101184', '201542299349101189', '201542299349101209', '201542299349101219', '201542299349101224', '201542299349101254', '201542299349101259', '201542299349101269', 
'201542309349100004', '201542309349100104', '201542309349100109', '201542309349100304', '201542309349100314', '201532299349300603', '201532299349300613', '201532299349300618', '201532299349300633', '201532299349300638', '201532299349300648', '201532299349300713', '201532299349302723', '201532299349302753', '201532299349304498', '201532299349304578', '201532299349304603', '201532299349304628', '201532299349304638', '201532299349304653', '201532299349304668', '201542269349301714', '201542269349301719', '201542269349301809', '201542269349301814', '201522389349300607', '201522389349300612', '201522389349300617', '201522389349300622', '201522389349300632', '201522389349300637', '201522389349300642', '201522389349300647', '201522389349300702', '201522389349300707', '201522389349300712', '201522389349300717', '201522389349300727', '201522389349300732', '201522389349300802', '201522389349300822', '201522389349300827', '201522389349300902', '201522389349300907', '201522389349300917', '201532379349300013', '201522399349300032'] \ No newline at end of file +object_ids_2015 = [ + "201542399349300614", + "201542399349300619", + "201542399349300629", + "201542399349300634", + "201542399349300719", + "201542399349300724", + "201542399349300739", + "201522369349300102", + "201522369349300112", + "201522369349300117", + "201522369349300122", + "201522369349300127", + "201522369349300132", + "201522369349300137", + "201522369349300142", + "201522369349300147", + "201522369349300202", + "201522369349300207", + "201522369349300212", + "201522369349300227", + "201522369349300307", + "201522369349300317", + "201532299349304913", + "201532299349304953", + "201542379349300864", + "201542379349300874", + "201542379349300884", + "201542379349301004", + "201542379349301009", + "201532299349302418", + "201532299349302423", + "201532299349302433", + "201532299349302443", + "201532299349302473", + "201532299349302483", + "201532299349302488", + "201532299349302498", + "201532299349302503", + 
"201532299349302518", + "201532299349302523", + "201532299349302543", + "201532299349302558", + "201542399349200309", + "201542399349200319", + "201542399349200324", + "201542399349200334", + "201542399349200339", + "201542399349200609", + "201542399349200614", + "201542399349200709", + "201542399349200714", + "201542399349200814", + "201542399349200909", + "201542399349201004", + "201522379349200037", + "201522379349200127", + "201522379349200202", + "201522379349200212", + "201522379349200307", + "201522379349200312", + "201522379349200322", + "201522379349200402", + "201522379349200612", + "201522379349200712", + "201522379349200722", + "201532369349200018", + "201532369349200023", + "201532399349201003", + "201542399349200019", + "201542399349200104", + "201542399349200119", + "201542399349200129", + "201542399349200504", + "201542399349200509", + "201502649349200005", + "201502649349200010", + "201502649349200100", + "201502649349200105", + "201502649349200110", + "201502649349200120", + "201502649349200125", + "201502649349200200", + "201502649349200205", + "201502649349200210", + "201502649349200215", + "201502649349200225", + "201502649349200250", + "201502649349200255", + "201502649349200265", + "201502649349200300", + "201502649349200315", + "201502649349200330", + "201502649349200355", + "201522679349200002", + "201522679349200022", + "201522679349200032", + "201522679349200102", + "201502669349301050", + "201502669349301070", + "201522659349300002", + "201522659349300012", + "201522659349300042", + "201532649349300343", + "201532649349300348", + "201532649349300433", + "201532649349300438", + "201532649349300538", + "201532649349300603", + "201532649349300658", + "201532649349300663", + "201542649349300034", + "201542649349300039", + "201542649349300184", + "201542649349300234", + "201542649349300329", + "201542649349300409", + "201512659349200121", + "201512659349200131", + "201512659349200201", + "201512659349200211", + "201512659349200316", + 
"201512659349200326", + "201512659349200501", + "201512659349200526", + "201512659349200616", + "201512659349200621", + "201512659349200716", + "201522649349200267", + "201522649349200357", + "201522649349200372", + "201522649349200402", + "201522649349200407", + "201532649349200008", + "201532649349200013", + "201532649349200103", + "201542619349200304", + "201542619349200514", + "201542619349200809", + "201512669349301106", + "201522669349300002", + "201522669349300017", + "201522669349300022", + "201532659349300538", + "201532659349300548", + "201532659349300608", + "201532659349300628", + "201532659349300638", + "201532659349300728", + "201532659349300753", + "201532659349300953", + "201532659349301003", + "201532659349301053", + "201532659349301058", + "201542659349300224", + "201542659349300234", + "201542659349300239", + "201542659349300314", + "201512649349300026", + "201512649349300036", + "201532299349302568", + "201532299349304203", + "201532299349304208", + "201532299349304223", + "201522369349300517", + "201522369349300527", + "201522369349300532", + "201522369349300537", + "201522369349300542", + "201522369349300547", + "201522369349300602", + "201522369349300607", + "201522369349300612", + "201522369349300617", + "201522369349300622", + "201522369349300632", + "201522369349300637", + "201522369349300652", + "201522369349300667", + "201522369349300687", + "201522369349300802", + "201532339349300003", + "201532339349300008", + "201532339349300018", + "201542389349300009", + "201542389349300014", + "201542269349301474", + "201542269349301479", + "201542269349301489", + "201542269349301499", + "201542269349301554", + "201542269349301564", + "201542269349301574", + "201542269349301584", + "201542269349301589", + "201542269349301599", + "201542269349301659", + "201542269349301664", + "201542269349301674", + "201542269349301679", + "201542269349301694", + "201542269349301699", + "201542279349300039", + "201542399349200819", + "201542399349200904", + 
"201542399349200914", + "201532299349100643", + "201532299349100648", + "201532299349100703", + "201532299349100713", + "201532299349100718", + "201532299349100723", + "201542269349100129", + "201542269349100134", + "201542269349100139", + "201542269349100144", + "201542269349100149", + "201542269349100204", + "201542269349100209", + "201542269349100219", + "201542269349100224", + "201542269349100234", + "201542269349100239", + "201542269349100249", + "201542269349100304", + "201542269349100314", + "201542269349100324", + "201542269349100329", + "201542269349100339", + "201502649349300020", + "201542669349300224", + "201542669349300234", + "201542669349300244", + "201542669349300304", + "201542669349300309", + "201542669349300324", + "201542669349300329", + "201542669349300409", + "201542669349300414", + "201542669349300429", + "201542669349300434", + "201542669349300444", + "201542669349300509", + "201542669349300514", + "201542669349300524", + "201542669349300529", + "201542669349300534", + "201502379349200000", + "201502379349200010", + "201502379349200020", + "201502379349200040", + "201502379349200115", + "201502379349200135", + "201502379349200305", + "201502379349200310", + "201502379349200320", + "201502379349200620", + "201502389349200410", + "201502389349200415", + "201502389349200520", + "201502389349200630", + "201502389349200705", + "201502389349200730", + "201502389349200760", + "201512379349200136", + "201512379349200316", + "201512379349200711", + "201502389349200300", + "201512379349200016", + "201512379349200026", + "201512379349200031", + "201512379349200041", + "201512379349200101", + "201512379349200111", + "201512379349200116", + "201512379349200126", + "201512379349200201", + "201512379349200211", + "201512379349200216", + "201512379349200226", + "201512379349200301", + "201512379349200306", + "201512379349200321", + "201512379349200401", + "201512379349200406", + "201512379349200416", + "201512379349200601", + "201512379349200611", + 
"201512649349300051", + "201512649349300151", + "201512649349300166", + "201512649349300176", + "201512649349300196", + "201512649349300226", + "201512649349300231", + "201512649349300301", + "201512649349300316", + "201512649349300326", + "201512649349300341", + "201542679349300134", + "201542679349300149", + "201542679349300214", + "201542679349300219", + "201542679349300239", + "201542679349300244", + "201542679349300314", + "201542679349300319", + "201532319349200118", + "201532319349200323", + "201542269349201814", + "201542269349201874", + "201542269349201884", + "201542269349201969", + "201542269349201979", + "201542299349200214", + "201542299349200244", + "201542299349200329", + "201542299349200429", + "201542299349200509", + "201542299349200524", + "201542309349200034", + "201542309349200104", + "201542309349200134", + "201542309349200244", + "201542309349200404", + "201502349349300000", + "201502349349300200", + "201502349349300700", + "201502359349300100", + "201502359349300400", + "201502369349300000", + "201542279349300104", + "201542279349300114", + "201542279349300119", + "201512399349300006", + "201512399349300016", + "201512399349300021", + "201512399349300031", + "201512399349300036", + "201512399349300116", + "201512399349300121", + "201512399349300136", + "201542339349300119", + "201542339349300124", + "201542339349300129", + "201542339349300134", + "201542339349300204", + "201542339349300214", + "201542339349300234", + "201542339349300309", + "201542339349300314", + "201542339349300334", + "201542339349300404", + "201512399349300321", + "201512399349300336", + "201512399349300341", + "201512399349300411", + "201532369349300503", + "201532369349300508", + "201532369349300513", + "201532369349300518", + "201532369349300523", + "201542339349300419", + "201542339349300504", + "201542339349300519", + "201542339349300529", + "201542339349300609", + "201542339349300614", + "201542339349300624", + "201542339349300804", + "201542339349300814", + 
"201542339349300819", + "201542349349300204", + "201542359349300304", + "201502619349300005", + "201502619349300010", + "201502619349300015", + "201502619349300020", + "201502619349300025", + "201502619349300100", + "201502619349300105", + "201502619349300110", + "201502619349300125", + "201502619349300135", + "201502619349300200", + "201502619349300205", + "201502619349300215", + "201502619349300225", + "201502619349300230", + "201542649349300619", + "201542649349300624", + "201542649349300634", + "201542649349300639", + "201542649349300654", + "201542649349300664", + "201542649349300684", + "201542649349300754", + "201512619349300911", + "201512619349301001", + "201512619349301006", + "201512619349301101", + "201512629349300101", + "201512679349300716", + "201512679349300731", + "201512679349300736", + "201512679349300746", + "201512679349300801", + "201512679349300816", + "201512679349300826", + "201512679349300836", + "201512679349300901", + "201512679349300906", + "201512679349300921", + "201512679349301011", + "201512679349301101", + "201502649349300615", + "201502649349300620", + "201512679349100006", + "201512679349100101", + "201512679349100106", + "201512679349100201", + "201512679349100206", + "201512679349100306", + "201522669349100002", + "201522669349100102", + "201522669349100202", + "201522669349100402", + "201522669349100602", + "201522669349100702", + "201532659349100503", + "201532659349100508", + "201532659349100603", + "201542659349100009", + "201542659349100104", + "201542659349100204", + "201542659349100404", + "201542659349100504", + "201542659349100604", + "201522649349300162", + "201522649349300172", + "201522649349300177", + "201522649349300182", + "201522649349300187", + "201532619349300708", + "201532619349300718", + "201532619349300808", + "201532619349300813", + "201532619349300903", + "201532619349300908", + "201532619349300918", + "201532619349301103", + "201532639349300053", + "201502369349300005", + "201512369349300001", + 
"201512369349300031", + "201512369349300111", + "201512369349300116", + "201512369349300126", + "201512369349300221", + "201512369349300231", + "201512369349300246", + "201512369349300336", + "201512369349300451", + "201512369349300496", + "201512369349300521", + "201512369349300546", + "201512369349300621", + "201532319349200423", + "201532319349200433", + "201532319349200518", + "201532319349200543", + "201532319349200608", + "201532319349200713", + "201532319349200833", + "201542269349202034", + "201542269349202039", + "201542269349202059", + "201542269349202074", + "201542269349202094", + "201542299349200609", + "201542299349200619", + "201542299349200724", + "201542299349200814", + "201542299349200844", + "201542309349200524", + "201542309349200534", + "201542309349200619", + "201532269349200713", + "201532269349200738", + "201532269349201038", + "201532269349201043", + "201532269349201053", + "201532269349201078", + "201532269349201108", + "201532339349300508", + "201532339349300513", + "201532339349300528", + "201532339349300533", + "201532339349300608", + "201532339349300613", + "201532339349300623", + "201532339349300633", + "201532339349300808", + "201532339349300823", + "201532349349300203", + "201532349349300303", + "201532399349300238", + "201532399349300243", + "201532399349300308", + "201542389349300844", + "201542389349300849", + "201542389349300914", + "201522339349300127", + "201522339349300137", + "201522339349300202", + "201522339349300217", + "201522339349300222", + "201522339349300227", + "201522339349300232", + "201522339349300302", + "201522339349300307", + "201522339349300317", + "201522339349300322", + "201522339349300327", + "201522339349300407", + "201522339349300417", + "201522339349300427", + "201522339349300517", + "201522339349300522", + "201542369349300244", + "201542369349300249", + "201542369349300324", + "201542369349300404", + "201542369349300454", + "201532379349300883", + "201532379349300903", + "201542369349300474", + 
"201502649349300640", + "201502649349300650", + "201502649349300655", + "201502649349300660", + "201502649349300670", + "201502649349300675", + "201502649349300700", + "201522679349300612", + "201522679349300617", + "201522679349300702", + "201522679349300712", + "201522679349300717", + "201532679349300013", + "201532679349300023", + "201532679349300033", + "201532679349300108", + "201532679349300113", + "201532679349300133", + "201502659349300105", + "201502659349300110", + "201502659349300120", + "201502659349300135", + "201502659349300140", + "201512649349300416", + "201512649349300421", + "201512649349300431", + "201512649349300436", + "201512649349300501", + "201512649349300511", + "201512649349300521", + "201512649349300531", + "201512649349300541", + "201512649349300546", + "201512649349300611", + "201512649349300631", + "201512649349300641", + "201502649349300535", + "201502649349300610", + "201502649349300625", + "201502649349300635", + "201502649349300665", + "201502649349300680", + "201502649349300690", + "201532639349300203", + "201542619349300109", + "201542619349300114", + "201542619349300119", + "201542619349300124", + "201542619349300134", + "201542619349300204", + "201542619349300209", + "201542619349300219", + "201512669349300326", + "201512669349300331", + "201512669349300336", + "201512669349300411", + "201512669349300416", + "201532299349201658", + "201532299349201668", + "201532299349201703", + "201532299349201708", + "201542269349202354", + "201542269349202364", + "201542299349200919", + "201542299349201129", + "201542299349201139", + "201542309349200804", + "201542309349200964", + "201542309349200974", + "201542309349201104", + "201532269349201478", + "201532269349201518", + "201532269349201543", + "201532269349201563", + "201532269349201603", + "201532269349201633", + "201532289349200113", + "201532289349200118", + "201532289349200313", + "201532289349200603", + "201532289349200608", + "201532299349201993", + "201532329349200028", + 
"201532329349200113", + "201532329349200303", + "201532329349200403", + "201532329349200423", + "201542299349201424", + "201542299349201434", + "201542299349201554", + "201542299349201569", + "201542299349201604", + "201532269349201783", + "201532269349201793", + "201532269349201803", + "201532269349201818", + "201532299349202373", + "201532299349202388", + "201532299349202393", + "201542369349300484", + "201542369349300489", + "201542369349300509", + "201542369349300524", + "201542369349300529", + "201542369349300539", + "201542369349300544", + "201542369349300609", + "201542369349300619", + "201542369349300624", + "201542369349300634", + "201542369349300649", + "201542369349300654", + "201542369349300669", + "201542369349300674", + "201502379349300400", + "201502379349300500", + "201502379349300600", + "201502379349300610", + "201502379349300615", + "201502379349300620", + "201502379349300700", + "201502379349300705", + "201502379349300710", + "201502379349300720", + "201502379349300725", + "201502379349300730", + "201502379349300740", + "201502379349300745", + "201502379349300855", + "201502379349300860", + "201502379349300875", + "201502379349300905", + "201502379349300910", + "201502379349301005", + "201542379349300729", + "201542379349300734", + "201532269349302848", + "201532269349302868", + "201532269349302918", + "201532269349302923", + "201532269349302973", + "201532269349302993", + "201502649349300710", + "201502649349300750", + "201502649349300800", + "201512629349300601", + "201512649349300006", + "201512649349300011", + "201522619349300007", + "201522619349300012", + "201522619349300127", + "201522679349300227", + "201522679349300232", + "201522679349300412", + "201522679349300517", + "201522679349300722", + "201532679349300028", + "201532679349300118", + "201502679349200720", + "201502679349200750", + "201522669349200002", + "201522669349200017", + "201522669349200022", + "201522669349200107", + "201522669349200117", + "201522669349200122", + 
"201522669349200202", + "201522669349200217", + "201522669349200222", + "201542659349200424", + "201542659349200504", + "201542659349200524", + "201542659349200529", + "201542659349200614", + "201542659349200619", + "201542659349200709", + "201542659349200714", + "201512659349200001", + "201512659349200006", + "201512659349200011", + "201512659349200021", + "201512659349200101", + "201512659349200111", + "201512659349200126", + "201512669349300431", + "201512669349300441", + "201512669349300446", + "201512669349300511", + "201512669349300531", + "201512669349300601", + "201512669349300621", + "201512669349300626", + "201522659349300722", + "201522659349300757", + "201522659349300762", + "201522659349300802", + "201522659349300812", + "201522659349300907", + "201522659349301002", + "201522659349301052", + "201512659349300041", + "201512659349300046", + "201512659349300101", + "201512659349300121", + "201512659349300126", + "201512659349300206", + "201512659349300216", + "201512659349300221", + "201512659349300236", + "201512659349300241", + "201512659349300301", + "201512659349300311", + "201512659349300326", + "201512659349300401", + "201512659349300416", + "201512659349300431", + "201522649349300547", + "201522649349300602", + "201542639349300204", + "201502659349300235", + "201502659349300300", + "201502659349300330", + "201502659349300415", + "201502659349300430", + "201502659349300545", + "201502659349300625", + "201512649349300661", + "201532299349202433", + "201532299349202508", + "201532299349202518", + "201532299349202533", + "201532299349202538", + "201532299349202568", + "201542279349200014", + "201542279349200139", + "201542279349200229", + "201542279349200324", + "201542279349200429", + "201542299349201924", + "201542299349201984", + "201542299349202154", + "201532299349202613", + "201532299349202618", + "201532299349202668", + "201532299349202678", + "201532299349202723", + "201532299349202763", + "201532299349202778", + "201542279349200634", + 
"201542279349200714", + "201542279349200724", + "201542279349200734", + "201542279349200809", + "201542299349202209", + "201542299349202259", + "201542299349202334", + "201542299349202359", + "201542299349202369", + "201542299349202379", + "201542299349202394", + "201532299349100018", + "201532299349100023", + "201532299349100028", + "201532299349100033", + "201532299349100038", + "201532299349100043", + "201532299349100048", + "201532299349100103", + "201532299349100108", + "201532269349303013", + "201532269349303018", + "201532299349302438", + "201532299349302453", + "201532299349302463", + "201532299349302468", + "201532299349302538", + "201532299349302553", + "201532299349304308", + "201532299349304318", + "201532299349304353", + "201532299349304423", + "201532299349304443", + "201532299349304463", + "201512399349300731", + "201512399349300741", + "201522389349300042", + "201522389349300047", + "201522389349300102", + "201522389349300107", + "201522389349300202", + "201522389349300207", + "201522389349300212", + "201522389349300242", + "201522389349300247", + "201522389349300307", + "201522389349300312", + "201522389349300317", + "201522389349300322", + "201522389349300327", + "201532369349300608", + "201532369349300658", + "201532369349300673", + "201532369349300683", + "201532369349300758", + "201532369349300803", + "201532269349303203", + "201532269349303233", + "201532299349300113", + "201532299349300128", + "201532299349302648", + "201532299349302703", + "201512659349200216", + "201522649349200317", + "201522649349200322", + "201522649349200332", + "201522649349200362", + "201522649349200367", + "201542619349200504", + "201542619349200509", + "201542619349200604", + "201542619349200704", + "201542619349200804", + "201542619349200904", + "201502669349300410", + "201502669349300420", + "201502669349300425", + "201502669349300440", + "201502669349300445", + "201502669349300500", + "201502669349300515", + "201502669349300525", + "201502669349300530", + 
"201502669349300535", + "201502669349300600", + "201502669349300605", + "201502669349300610", + "201502669349300615", + "201502669349300620", + "201502669349300630", + "201502669349300645", + "201502669349300800", + "201512379349300911", + "201512379349301006", + "201512379349301016", + "201512379349301021", + "201522669349300952", + "201522669349301057", + "201522669349301067", + "201522669349301102", + "201532669349300003", + "201532669349300008", + "201532669349300023", + "201532669349300108", + "201532669349300113", + "201512649349300681", + "201512649349300701", + "201522649349300027", + "201522649349300152", + "201522649349300167", + "201532619349300203", + "201532619349300213", + "201532619349300238", + "201532619349300303", + "201532619349300418", + "201532619349300513", + "201532619349300603", + "201542679349300919", + "201502639349200250", + "201512619349200001", + "201512619349200011", + "201512619349200101", + "201512619349200206", + "201512619349200211", + "201512619349200306", + "201512619349200311", + "201512619349200316", + "201512619349200321", + "201512619349200401", + "201512619349200411", + "201512619349200501", + "201512619349200601", + "201512619349200606", + "201512619349200701", + "201512619349200711", + "201512619349200801", + "201512619349200811", + "201532299349100113", + "201532299349100118", + "201532299349100123", + "201532299349100128", + "201532299349100133", + "201532299349100138", + "201532299349100203", + "201532299349100208", + "201532299349100213", + "201532299349100218", + "201532299349100223", + "201532299349100233", + "201532299349100238", + "201532299349100248", + "201532299349100303", + "201542269349100004", + "201542269349100014", + "201542299349101134", + "201542299349101154", + "201542299349101159", + "201542299349101169", + "201542299349101174", + "201542299349101184", + "201542299349101189", + "201542299349101209", + "201542299349101219", + "201542299349101224", + "201542299349101254", + "201542299349101259", + 
"201542299349101269", + "201542309349100004", + "201542309349100104", + "201542309349100109", + "201542309349100304", + "201542309349100314", + "201532299349300603", + "201532299349300613", + "201532299349300618", + "201532299349300633", + "201532299349300638", + "201532299349300648", + "201532299349300713", + "201532299349302723", + "201532299349302753", + "201532299349304498", + "201532299349304578", + "201532299349304603", + "201532299349304628", + "201532299349304638", + "201532299349304653", + "201532299349304668", + "201542269349301714", + "201542269349301719", + "201542269349301809", + "201542269349301814", + "201522389349300607", + "201522389349300612", + "201522389349300617", + "201522389349300622", + "201522389349300632", + "201522389349300637", + "201522389349300642", + "201522389349300647", + "201522389349300702", + "201522389349300707", + "201522389349300712", + "201522389349300717", + "201522389349300727", + "201522389349300732", + "201522389349300802", + "201522389349300822", + "201522389349300827", + "201522389349300902", + "201522389349300907", + "201522389349300917", + "201532379349300013", + "201522399349300032", +] diff --git a/irs_reader/settings.py b/irs_reader/settings.py index b20ea4a..08adf35 100644 --- a/irs_reader/settings.py +++ b/irs_reader/settings.py @@ -1,14 +1,15 @@ -import sys import os +import sys + from .dir_utils import mkdir_p IRS_READER_ROOT = os.path.abspath(os.path.dirname(__file__)) # This is the URL to amazon's bucket, could use another synced to it -IRS_XML_HTTP_BASE = "https://s3.amazonaws.com/irs-form-990" +IRS_XML_HTTP_BASE = "https://gt990datalake-rawdata.s3.amazonaws.com/EfileData/XmlFiles" # It can be hard to locate this. 
-IRSX_SETTINGS_LOCATION = (os.path.join(IRS_READER_ROOT, "settings.py")) +IRSX_SETTINGS_LOCATION = os.path.join(IRS_READER_ROOT, "settings.py") # Defaults to the same directory as this settings file, but you can override # with the `IRSX_CACHE_DIRECTORY` environment variable @@ -16,45 +17,103 @@ # The directory we put files in while we're processing them WORKING_DIRECTORY = os.environ.get( - "IRSX_WORKING_DIRECTORY", os.path.join(IRSX_CACHE_DIRECTORY, "XML")) + "IRSX_WORKING_DIRECTORY", os.path.join(IRSX_CACHE_DIRECTORY, "XML") +) # Helpful to keep these around for lookup purposes INDEX_DIRECTORY = os.environ.get( - "IRSX_INDEX_DIRECTORY", os.path.join(IRSX_CACHE_DIRECTORY, "CSV")) + "IRSX_INDEX_DIRECTORY", os.path.join(IRSX_CACHE_DIRECTORY, "CSV") +) IRS_INDEX_BASE = "https://apps.irs.gov/pub/epostcard/990/xml/%s/index_%s.csv" KNOWN_SCHEDULES = [ - "IRS990", "IRS990EZ", "IRS990PF", "IRS990ScheduleA", - "IRS990ScheduleB", "IRS990ScheduleC", "IRS990ScheduleD", - "IRS990ScheduleE", "IRS990ScheduleF", "IRS990ScheduleG", - "IRS990ScheduleH", "IRS990ScheduleI", "IRS990ScheduleJ", - "IRS990ScheduleK", "IRS990ScheduleL", "IRS990ScheduleM", - "IRS990ScheduleN", "IRS990ScheduleO", "IRS990ScheduleR", - "ReturnHeader990x" + "IRS990", + "IRS990EZ", + "IRS990PF", + "IRS990ScheduleA", + "IRS990ScheduleB", + "IRS990ScheduleC", + "IRS990ScheduleD", + "IRS990ScheduleE", + "IRS990ScheduleF", + "IRS990ScheduleG", + "IRS990ScheduleH", + "IRS990ScheduleI", + "IRS990ScheduleJ", + "IRS990ScheduleK", + "IRS990ScheduleL", + "IRS990ScheduleM", + "IRS990ScheduleN", + "IRS990ScheduleO", + "IRS990ScheduleR", + "ReturnHeader990x", ] -# these could get pushed to metadata directory? +# these could get pushed to metadata directory? 
ALLOWED_VERSIONSTRINGS = [ - '2013v3.0', '2013v3.1', '2013v4.0', - '2014v5.0', '2014v6.0', - '2015v2.0', '2015v2.1', '2015v3.0', - '2016v3.0', '2016v3.1', - '2017v2.0', '2017v2.1', '2017v2.2', '2017v2.3', - '2018v3.0', '2018v3.1', '2018v3.2', '2018v3.3', - '2019v5.0', '2019v5.1', '2019v5.2', - '2020v1.0', '2020v1.1','2020v1.2','2020v1.3', '2020v2.0', '2020v3.0', '2020v4.0','2020v4.1', '2020v4.2', - '2021v4.0','2021v4.1','2021v4.2','2021v4.3', - '2022v4.0','2022v4.1','2022v5.0', - '2022v6.0','2022v7.0', + "2013v3.0", + "2013v3.1", + "2013v4.0", + "2014v5.0", + "2014v6.0", + "2015v2.0", + "2015v2.1", + "2015v3.0", + "2016v3.0", + "2016v3.1", + "2017v2.0", + "2017v2.1", + "2017v2.2", + "2017v2.3", + "2018v3.0", + "2018v3.1", + "2018v3.2", + "2018v3.3", + "2019v5.0", + "2019v5.1", + "2019v5.2", + "2020v1.0", + "2020v1.1", + "2020v1.2", + "2020v1.3", + "2020v2.0", + "2020v3.0", + "2020v4.0", + "2020v4.1", + "2020v4.2", + "2021v4.0", + "2021v4.1", + "2021v4.2", + "2021v4.3", + "2022v4.0", + "2022v4.1", + "2022v5.0", + "2022v6.0", + "2022v7.0", # these are guesses for future 2023 schemas; they might not actually exist - '2023v1.0', - '2023v2.0', - '2023v3.0','2023v3.1','2023v3.2','2023v3.3', - '2023v4.0','2023v4.1','2023v4.2','2023v4.3', - '2023v5.0','2023v5.1','2023v5.2','2023v5.3', - '2023v6.0','2023v6.1','2023v6.2','2023v6.3', - '2023v7.0','2023v7.1','2023v7.2','2023v7.3', + "2023v1.0", + "2023v2.0", + "2023v3.0", + "2023v3.1", + "2023v3.2", + "2023v3.3", + "2023v4.0", + "2023v4.1", + "2023v4.2", + "2023v4.3", + "2023v5.0", + "2023v5.1", + "2023v5.2", + "2023v5.3", + "2023v6.0", + "2023v6.1", + "2023v6.2", + "2023v6.3", + "2023v7.0", + "2023v7.1", + "2023v7.2", + "2023v7.3", ] # 2020 is experimental @@ -63,15 +122,25 @@ # We can capture the group structure for these so it doesn't break # but these versions ARE NOT supported and aren't mapped to IRSx variables CSV_ALLOWED_VERSIONSTRINGS = ALLOWED_VERSIONSTRINGS + [ - '2010v3.2', '2010v3.4', '2010v3.6', '2010v3.7', 
'2011v1.2', '2011v1.3', - '2011v1.4', '2011v1.5', '2012v2.0', '2012v2.1', '2012v2.2', '2012v2.3', - '2012v3.0' + "2010v3.2", + "2010v3.4", + "2010v3.6", + "2010v3.7", + "2011v1.2", + "2011v1.3", + "2011v1.4", + "2011v1.5", + "2012v2.0", + "2012v2.1", + "2012v2.2", + "2012v2.3", + "2012v3.0", ] -METADATA_DIRECTORY = (os.path.join(IRS_READER_ROOT, "metadata")) +METADATA_DIRECTORY = os.path.join(IRS_READER_ROOT, "metadata") KEYERROR_LOG = os.path.join(IRS_READER_ROOT, "keyerrors.log") -LOG_KEY = 'xml' +LOG_KEY = "xml" mkdir_p([WORKING_DIRECTORY, INDEX_DIRECTORY]) diff --git a/irs_reader/sked_dict_reader.py b/irs_reader/sked_dict_reader.py index 5c2e722..264fa97 100644 --- a/irs_reader/sked_dict_reader.py +++ b/irs_reader/sked_dict_reader.py @@ -1,8 +1,14 @@ -from .type_utils import dictType, orderedDictType, listType, \ - unicodeType, noneType, strType from .flatten_utils import flatten from .keyerror_utils import ignorable_keyerror from .settings import LOG_KEY +from .type_utils import ( + dictType, + listType, + noneType, + orderedDictType, + strType, + unicodeType, +) class SkedDictReader(object): @@ -11,6 +17,7 @@ class SkedDictReader(object): into xpath-ed variables and repeated structures. 
Will also work on reading xmltodict that was previously turned into json """ + def __init__( self, standardizer, @@ -19,21 +26,21 @@ def __init__( ein, documentId=None, documentation=False, - csv_format=False + csv_format=False, ): self.standardizer = standardizer self.object_id = object_id self.ein = ein self.documentId = documentId - self.schedule_parts = {} # allows one entry per filing - self.repeating_groups = {} # multiple per filing - self.csv_format = csv_format # Do we need to generate ordered csv - self.for_csv_list = [] # keep record of elements, line by line + self.schedule_parts = {} # allows one entry per filing + self.repeating_groups = {} # multiple per filing + self.csv_format = csv_format # Do we need to generate ordered csv + self.for_csv_list = [] # keep record of elements, line by line self.groups = groups self.documentation = documentation - self.variable_keyerrors = [] # record any unexpected variables - self.group_keyerrors = [] # or unexpected groups + self.variable_keyerrors = [] # record any unexpected variables + self.group_keyerrors = [] # or unexpected groups if self.documentation and not self.standardizer.get_documentation_status(): # Todo: split out documenter entirely so we don't have to do this @@ -43,52 +50,49 @@ def __init__( ) def _get_table_start(self): - """ prefill the columns we need for all tables """ + """prefill the columns we need for all tables""" if self.documentation: standardized_table_start = { - 'object_id': { - 'value': self.object_id, - 'ordering': -1, - 'line_number': 'NA', - 'description': 'IRS-assigned object id', - 'db_type': 'String(18)' + "object_id": { + "value": self.object_id, + "ordering": -1, + "line_number": "NA", + "description": "IRS-assigned object id", + "db_type": "String(18)", + }, + "ein": { + "value": self.ein, + "ordering": -2, + "line_number": "NA", + "description": "IRS employer id number", + "db_type": "String(9)", }, - 'ein': { - 'value': self.ein, - 'ordering': -2, - 'line_number': 'NA', 
- 'description': 'IRS employer id number', - 'db_type': 'String(9)' - } } if self.documentId: - standardized_table_start['documentId'] = { - 'value': self.documentId, - 'description': 'Document ID', - 'ordering': 0 + standardized_table_start["documentId"] = { + "value": self.documentId, + "description": "Document ID", + "ordering": 0, } else: - standardized_table_start = { - 'object_id': self.object_id, - 'ein': self.ein - } + standardized_table_start = {"object_id": self.object_id, "ein": self.ein} if self.documentId: - standardized_table_start['documentId'] = self.documentId + standardized_table_start["documentId"] = self.documentId return standardized_table_start def _process_group(self, json_node, path, this_group): for node_index, node in enumerate(json_node): - #print("_process_group %s " % (this_group['db_name'])) + # print("_process_group %s " % (this_group['db_name'])) this_node_type = type(node) flattened_list_item = None if this_node_type == unicodeType: - #print("_pg: unicodeType %s ") + # print("_pg: unicodeType %s ") flattened_list_item = {path: node} else: - #print("_pg: NOT unicodeType") - flattened_list_item = flatten(node, parent_key=path, sep='/') + # print("_pg: NOT unicodeType") + flattened_list_item = flatten(node, parent_key=path, sep="/") table_name = None standardized_group_dict = self._get_table_start() @@ -101,11 +105,11 @@ def _process_group(self, json_node, path, this_group): if self.csv_format: this_var = { - 'xpath':xpath, - 'value':value, - 'in_group':True, - 'group_name':this_group['db_name'], - 'group_index':node_index + "xpath": xpath, + "value": value, + "in_group": True, + "group_name": this_group["db_name"], + "group_index": node_index, } self.for_csv_list.append(this_var) @@ -113,20 +117,18 @@ def _process_group(self, json_node, path, this_group): this_var_data = self.standardizer.get_var(xpath) except KeyError: if not ignorable_keyerror(xpath): - self.variable_keyerrors.append( - {'element_path':xpath} - ) + 
self.variable_keyerrors.append({"element_path": xpath}) continue this_var_value = flattened_list_item[xpath] - this_var_name = this_var_data['db_name'] - table_name = this_var_data['db_table'] + this_var_name = this_var_data["db_name"] + table_name = this_var_data["db_table"] if self.documentation: result = { - 'value': this_var_value, - 'ordering': this_var_data['ordering'], - 'line_number': this_var_data['line_number'], - 'description': this_var_data['description'], - 'db_type': this_var_data['db_type'] + "value": this_var_value, + "ordering": this_var_data["ordering"], + "line_number": this_var_data["line_number"], + "description": this_var_data["description"], + "db_type": this_var_data["db_type"], } standardized_group_dict[this_var_name] = result @@ -142,15 +144,13 @@ def _parse_json(self, json_node, parent_path=""): element_path = parent_path if this_node_type == listType: - #print("List type %s" % element_path) + # print("List type %s" % element_path) this_group = None try: this_group = self.groups[element_path] except KeyError: - self.group_keyerrors.append( - {'element_path':element_path} - ) + self.group_keyerrors.append({"element_path": element_path}) self._process_group(json_node, parent_path, this_group) elif this_node_type == unicodeType: @@ -162,23 +162,19 @@ def _parse_json(self, json_node, parent_path=""): try: # is it a group? 
this_group = self.groups[element_path] - self._process_group( - [{parent_path: json_node}], - '', - this_group - ) + self._process_group([{parent_path: json_node}], "", this_group) except KeyError: # It's not a group so it should be a variable we know about - + if self.csv_format: this_var = { - 'xpath':element_path, - 'value':json_node, - 'in_group':False, - 'group_name':None, - 'group_index':None + "xpath": element_path, + "value": json_node, + "in_group": False, + "group_name": None, + "group_index": None, } self.for_csv_list.append(this_var) @@ -191,24 +187,22 @@ def _parse_json(self, json_node, parent_path=""): # pass through for some common key errors # [ TODO: FIX THE KEYERRORS! ] if not ignorable_keyerror(element_path): - self.variable_keyerrors.append( - {'element_path':element_path} - ) + self.variable_keyerrors.append({"element_path": element_path}) var_found = False if var_found: - table_name = var_data['db_table'] - var_name = var_data['db_name'] + table_name = var_data["db_table"] + var_name = var_data["db_name"] result = json_node if self.documentation: result = { - 'value': json_node, - 'ordering': var_data['ordering'], - 'line_number': var_data['line_number'], - 'description': var_data['description'], - 'db_type': var_data['db_type'] + "value": json_node, + "ordering": var_data["ordering"], + "line_number": var_data["line_number"], + "description": var_data["description"], + "db_type": var_data["db_type"], } try: @@ -217,13 +211,12 @@ def _parse_json(self, json_node, parent_path=""): self.schedule_parts[table_name] = self._get_table_start() self.schedule_parts[table_name][var_name] = result - elif this_node_type == orderedDictType or this_node_type == dictType: try: # is it a singleton group? 
this_group = self.groups[element_path] - self._process_group([{parent_path: json_node}], '', this_group) + self._process_group([{parent_path: json_node}], "", this_group) except KeyError: keys = json_node.keys() @@ -236,16 +229,16 @@ def _parse_json(self, json_node, parent_path=""): elif this_node_type == strType: msg = "String '%s'" % json_node - #self.logging.debug(msg) + # self.logging.debug(msg) else: raise Exception("Unhandled type: %s" % (type(json_node))) def parse(self, raw_ordered_dict, parent_path=""): self._parse_json(raw_ordered_dict, parent_path=parent_path) - return ({ - 'schedule_parts': self.schedule_parts, - 'groups': self.repeating_groups, - 'csv_line_array':self.for_csv_list, # This is empty if not csv - 'keyerrors':self.variable_keyerrors, - 'group_keyerrors':self.group_keyerrors - }) + return { + "schedule_parts": self.schedule_parts, + "groups": self.repeating_groups, + "csv_line_array": self.for_csv_list, # This is empty if not csv + "keyerrors": self.variable_keyerrors, + "group_keyerrors": self.group_keyerrors, + } diff --git a/irs_reader/standardizer.py b/irs_reader/standardizer.py index e467d31..9de0b9c 100644 --- a/irs_reader/standardizer.py +++ b/irs_reader/standardizer.py @@ -1,11 +1,14 @@ +import collections import os import sys -import collections -#import logging + +# import logging from datetime import datetime -from .settings import METADATA_DIRECTORY, KEYERROR_LOG + +from .settings import KEYERROR_LOG, METADATA_DIRECTORY from .sked_dict_reader import SkedDictReader from .type_utils import listType + if sys.version_info >= (3, 0): import csv else: @@ -15,40 +18,38 @@ class Standardizer(object): """ This reads metadata .csv files, which it uses to standardize - ordered dicts. For documentation, see Documentizer below. + ordered dicts. For documentation, see Documentizer below. 
""" def __init__(self): - #self.show_documentation = documentation + # self.show_documentation = documentation self.groups = {} self.variables = {} self.schedule_parts = {} # This is overridden for Documentizer class below - self.variable_columns =['db_table', 'db_name'] + self.variable_columns = ["db_table", "db_name"] self._make_groups() self._make_variables() - def _make_groups(self): - group_filepath = os.path.join(METADATA_DIRECTORY, 'groups.csv') - with open(group_filepath, 'r') as reader_fh: + group_filepath = os.path.join(METADATA_DIRECTORY, "groups.csv") + with open(group_filepath, "r") as reader_fh: reader = csv.DictReader(reader_fh) for row in reader: - self.groups[row['xpath']] = row + self.groups[row["xpath"]] = row return True def _make_variables(self): - variable_filepath = os.path.join(METADATA_DIRECTORY, 'variables.csv') - with open(variable_filepath, 'r') as variable_fh: + variable_filepath = os.path.join(METADATA_DIRECTORY, "variables.csv") + with open(variable_filepath, "r") as variable_fh: reader = csv.DictReader(variable_fh) for row in reader: vardict = {} for col in self.variable_columns: - vardict[col]=row[col] - self.variables[row['xpath']] = vardict - + vardict[col] = row[col] + self.variables[row["xpath"]] = vardict return True @@ -58,27 +59,35 @@ def get_groups(self): def get_var(self, var_xpath, version=None): if version: raise Exception("Version checking is not implemented") - return (self.variables[var_xpath]) + return self.variables[var_xpath] def get_documentation_status(self): return False class Documentizer(Standardizer): - """ Like Standardizer, but returns canonical documentation info from 2016 version """ + """Like Standardizer, but returns canonical documentation info from 2016 version""" def __init__(self, versions=False): self.groups = {} self.variables = {} self.schedule_parts = {} - self.variable_columns =[ - 'db_table', 'db_name', 'ordering', - 'line_number', 'description', 'db_type', - 'irs_type', 'xpath' + 
self.variable_columns = [ + "db_table", + "db_name", + "ordering", + "line_number", + "description", + "db_type", + "irs_type", + "xpath", ] if versions: - self.variable_columns = self.variable_columns + ['version_start', 'version_end'] + self.variable_columns = self.variable_columns + [ + "version_start", + "version_end", + ] self._make_schedule_parts() self._make_groups() @@ -88,17 +97,16 @@ def get_documentation_status(self): return True def _make_schedule_parts(self): - part_filepath = os.path.join(METADATA_DIRECTORY, 'schedule_parts.csv') - with open(part_filepath, 'r') as reader_fh: + part_filepath = os.path.join(METADATA_DIRECTORY, "schedule_parts.csv") + with open(part_filepath, "r") as reader_fh: reader = csv.DictReader(reader_fh) for row in reader: - self.schedule_parts[row['parent_sked_part']] = { - 'name': row['part_name'], - 'ordering': row['ordering'], - 'parent_sked': row['parent_sked'], - 'parent_sked_part': row['parent_sked_part'], - 'is_shell': row['is_shell'] - + self.schedule_parts[row["parent_sked_part"]] = { + "name": row["part_name"], + "ordering": row["ordering"], + "parent_sked": row["parent_sked"], + "parent_sked_part": row["parent_sked_part"], + "is_shell": row["is_shell"], } return True @@ -107,29 +115,29 @@ def get_schedule_parts(self): def part_ordering(self, partname): try: - result = int(self.schedule_parts[partname]['ordering']) + result = int(self.schedule_parts[partname]["ordering"]) return result except KeyError: return None def group_ordering(self, groupname): try: - return self.groups[groupname]['ordering'] + return self.groups[groupname]["ordering"] except KeyError: return None def get_groups_by_sked(self, sked): groups = [] for thisgroup in self.groups.keys(): - if self.groups[thisgroup]['parent_sked'] == sked: + if self.groups[thisgroup]["parent_sked"] == sked: groups.append(self.groups[thisgroup]) return groups def get_parts_by_sked(self, sked): parts = [] for thispart in self.schedule_parts.keys(): - 
#print(self.schedule_parts[thispart]) - if self.schedule_parts[thispart]['parent_sked'] == sked: + # print(self.schedule_parts[thispart]) + if self.schedule_parts[thispart]["parent_sked"] == sked: parts.append(self.schedule_parts[thispart]) return parts @@ -137,7 +145,6 @@ def get_variables(self): return self.variables - class VersionDocumentizer(object): """ Returns version-specific line number and documentation. @@ -158,31 +165,30 @@ def check_version(self, versionstring, start_year, end_year): return result def _make_line_numbers(self): - filepath = os.path.join(METADATA_DIRECTORY, 'line_numbers.csv') - with open(filepath, 'r') as reader_fh: + filepath = os.path.join(METADATA_DIRECTORY, "line_numbers.csv") + with open(filepath, "r") as reader_fh: reader = csv.DictReader(reader_fh) for row in reader: try: - self.line_numbers[row['xpath']] - self.line_numbers[row['xpath']].append(row) + self.line_numbers[row["xpath"]] + self.line_numbers[row["xpath"]].append(row) except KeyError: - self.line_numbers[row['xpath']] = [row] + self.line_numbers[row["xpath"]] = [row] def _make_descriptions(self): - filepath = os.path.join(METADATA_DIRECTORY, 'descriptions.csv') - with open(filepath, 'r') as reader_fh: + filepath = os.path.join(METADATA_DIRECTORY, "descriptions.csv") + with open(filepath, "r") as reader_fh: reader = csv.DictReader(reader_fh) for row in reader: try: - self.descriptions[row['xpath']] - self.descriptions[row['xpath']].append(row) + self.descriptions[row["xpath"]] + self.descriptions[row["xpath"]].append(row) except KeyError: - self.descriptions[row['xpath']] = [row] - + self.descriptions[row["xpath"]] = [row] def get_line_number(self, xpath, version_string): candidate_rows = [] @@ -192,8 +198,10 @@ def get_line_number(self, xpath, version_string): return None for row in candidate_rows: - if self.check_version(version_string, row['version_start'], row['version_end']): - return row['line_number'] + if self.check_version( + version_string, 
row["version_start"], row["version_end"] + ): + return row["line_number"] return None @@ -204,7 +212,8 @@ def get_description(self, xpath, version_string): except KeyError: return None for row in candidate_rows: - if self.check_version(version_string, row['version_start'], row['version_end']): - return row['description'] + if self.check_version( + version_string, row["version_start"], row["version_end"] + ): + return row["description"] return None - diff --git a/irs_reader/text_format_utils.py b/irs_reader/text_format_utils.py index 515c4ba..6c22a1f 100644 --- a/irs_reader/text_format_utils.py +++ b/irs_reader/text_format_utils.py @@ -1,33 +1,36 @@ -import json -import sys import codecs -import re import csv +import json +import re +import sys + import unicodecsv - -from .standardizer import Standardizer, Documentizer, VersionDocumentizer +from .standardizer import Documentizer, Standardizer, VersionDocumentizer -BRACKET_RE = re.compile(r'\[.*?\]') +BRACKET_RE = re.compile(r"\[.*?\]") ASTERISKS = "****************" + def debracket(string): - """ Eliminate the bracketed var names in doc, line strings """ - result = re.sub(BRACKET_RE, ';', str(string)) - result = result.lstrip(';') - result = result.lstrip(' ') - result = result.replace('; ;',';') + """Eliminate the bracketed var names in doc, line strings""" + result = re.sub(BRACKET_RE, ";", str(string)) + result = result.lstrip(";") + result = result.lstrip(" ") + result = result.replace("; ;", ";") return result + def most_recent(semicolon_delimited_string): result = semicolon_delimited_string.split(";")[-1] return result + def to_json(data, outfilepath=None): if data: if outfilepath: - with open(outfilepath, 'w') as outfile: + with open(outfilepath, "w") as outfile: json.dump(data, outfile) else: if hasattr(sys.stdout, "buffer"): @@ -36,117 +39,139 @@ def to_json(data, outfilepath=None): else: json.dump(data, sys.stdout) -def to_csv(parsed_filing, object_id=None, standardizer=None, documentation=True, vd=None, 
outfilepath=None): + +def to_csv( + parsed_filing, + object_id=None, + standardizer=None, + documentation=True, + vd=None, + outfilepath=None, +): if not vd: vd = VersionDocumentizer() - stdout = getattr(sys.stdout, 'buffer', sys.stdout) + stdout = getattr(sys.stdout, "buffer", sys.stdout) if outfilepath: - stdout = open(outfilepath, 'wb') # or 'wb' ? + stdout = open(outfilepath, "wb") # or 'wb' ? fieldnames = [] - fieldnames = [ - 'object_id', 'form', 'line_number', 'description', 'value', 'variable_name', - 'xpath', 'in_group', 'group_name', 'group_index' - ] + fieldnames = [ + "object_id", + "form", + "line_number", + "description", + "value", + "variable_name", + "xpath", + "in_group", + "group_name", + "group_index", + ] writer = unicodecsv.DictWriter( - stdout, - fieldnames=fieldnames, - encoding='utf-8', - quoting=csv.QUOTE_MINIMAL + stdout, fieldnames=fieldnames, encoding="utf-8", quoting=csv.QUOTE_MINIMAL ) - writer.writeheader() # this fails in python3? + writer.writeheader() # this fails in python3? results = parsed_filing.get_result() if results: for result in results: - for this_result in result['csv_line_array']: + for this_result in result["csv_line_array"]: vardata = None try: - vardata = standardizer.get_var(this_result['xpath']) + vardata = standardizer.get_var(this_result["xpath"]) except KeyError: pass if vardata: - this_result['variable_name'] = vardata['db_table'] + "." + vardata['db_name'] + this_result["variable_name"] = ( + vardata["db_table"] + "." 
+ vardata["db_name"] + ) raw_line_num = vd.get_line_number( - this_result['xpath'], - parsed_filing.get_version() + this_result["xpath"], parsed_filing.get_version() ) - this_result['line_number'] = debracket(raw_line_num) + this_result["line_number"] = debracket(raw_line_num) raw_description = vd.get_description( - this_result['xpath'], - parsed_filing.get_version() + this_result["xpath"], parsed_filing.get_version() ) - this_result['description'] = debracket(raw_description) - this_result['form'] = this_result['xpath'].split("/")[1] - this_result['object_id'] = object_id + this_result["description"] = debracket(raw_description) + this_result["form"] = this_result["xpath"].split("/")[1] + this_result["object_id"] = object_id writer.writerow(this_result) -def to_txt(parsed_filing, standardizer=None, documentation=True, vd=None, outfilepath=None): +def to_txt( + parsed_filing, standardizer=None, documentation=True, vd=None, outfilepath=None +): if not vd: vd = VersionDocumentizer() results = parsed_filing.get_result() this_sked_name = None if outfilepath: - outfile = open(outfilepath, 'w') + outfile = open(outfilepath, "w") if results: for result in results: - for this_result in result['csv_line_array']: + for this_result in result["csv_line_array"]: #### Collect the variables we need vardata = None - textoutput = "\n" # This is what we'll eventually write out - this_result['form'] = this_result['xpath'].split("/")[1] + textoutput = "\n" # This is what we'll eventually write out + this_result["form"] = this_result["xpath"].split("/")[1] try: - vardata = standardizer.get_var(this_result['xpath']) + vardata = standardizer.get_var(this_result["xpath"]) except KeyError: pass if vardata: - this_result['variable_name'] = vardata['db_table'] + "." + vardata['db_name'] + this_result["variable_name"] = ( + vardata["db_table"] + "." 
+ vardata["db_name"] + ) raw_line_num = vd.get_line_number( - this_result['xpath'], - parsed_filing.get_version() + this_result["xpath"], parsed_filing.get_version() ) - this_result['line_number'] = debracket(raw_line_num) + this_result["line_number"] = debracket(raw_line_num) raw_description = vd.get_description( - this_result['xpath'], - parsed_filing.get_version() + this_result["xpath"], parsed_filing.get_version() ) - this_result['description'] = debracket(raw_description) - - #### Write the output, now that we've got the vars - - if this_sked_name != this_result['form']: - textoutput += "\n\n\n" + ASTERISKS + "\tSchedule %s\n" % this_result['form'] - this_sked_name = this_result['form'] - - textoutput += "\n" + ASTERISKS + "\n Value: '%s'\nForm: %s\nLine:%s\nDescription:%s" % ( - this_result['value'], - this_result['form'], - this_result['line_number'], - this_result['description'], + this_result["description"] = debracket(raw_description) + + #### Write the output, now that we've got the vars + + if this_sked_name != this_result["form"]: + textoutput += ( + "\n\n\n" + ASTERISKS + "\tSchedule %s\n" % this_result["form"] + ) + this_sked_name = this_result["form"] + + textoutput += ( + "\n" + + ASTERISKS + + "\n Value: '%s'\nForm: %s\nLine:%s\nDescription:%s" + % ( + this_result["value"], + this_result["form"], + this_result["line_number"], + this_result["description"], + ) ) if documentation: - textoutput += "\nXpath:%s" % (this_result['xpath']) + textoutput += "\nXpath:%s" % (this_result["xpath"]) - if this_result['in_group']: + if this_result["in_group"]: textoutput += "\nGroup: %s group_index %s" % ( - this_result['group_name'], - this_result['group_index'] + this_result["group_name"], + this_result["group_index"], ) else: textoutput += "\nGroup:" - + if outfilepath: outfile.write(textoutput) else: sys.stdout.write(textoutput) if outfilepath: - outfile.close() \ No newline at end of file + outfile.close() diff --git a/irs_reader/type_utils.py 
b/irs_reader/type_utils.py index 7f5fc38..7c6d86d 100644 --- a/irs_reader/type_utils.py +++ b/irs_reader/type_utils.py @@ -6,6 +6,6 @@ dictType = type(dict()) orderedDictType = type(OrderedDict()) listType = type(list()) -unicodeType = type(u'') +unicodeType = type("") noneType = type(None) -strType = type('') +strType = type("") diff --git a/irs_reader/xmlrunner.py b/irs_reader/xmlrunner.py index d8ca396..e25eea5 100644 --- a/irs_reader/xmlrunner.py +++ b/irs_reader/xmlrunner.py @@ -1,21 +1,27 @@ from .filing import Filing -from .standardizer import Standardizer, Documentizer, VersionDocumentizer +from .settings import ( + ALLOWED_VERSIONSTRINGS, + CSV_ALLOWED_VERSIONSTRINGS, + WORKING_DIRECTORY, +) from .sked_dict_reader import SkedDictReader +from .standardizer import Documentizer, Standardizer, VersionDocumentizer + # from .log_utils import configure_logging from .type_utils import listType -from .settings import WORKING_DIRECTORY, ALLOWED_VERSIONSTRINGS, CSV_ALLOWED_VERSIONSTRINGS class XMLRunner(object): - """ Load a Standardizer just once while running multiple filings - Return Filing objects with results, keyerrors set + """Load a Standardizer just once while running multiple filings + Return Filing objects with results, keyerrors set """ + def __init__(self, documentation=False, standardizer=None, csv_format=False): self.documentation = documentation self.csv_format = csv_format if documentation: - if not standardizer: + if not standardizer: self.standardizer = Documentizer() else: if standardizer: @@ -30,10 +36,10 @@ def get_standardizer(self): return self.standardizer def _run_schedule_k(self, sked, object_id, sked_dict, path_root, ein): - assert sked == 'IRS990ScheduleK' + assert sked == "IRS990ScheduleK" if type(sked_dict) == listType: for individual_sked in sked_dict: - doc_id = individual_sked['@documentId'] + doc_id = individual_sked["@documentId"] reader = SkedDictReader( self.standardizer, self.group_dicts, @@ -45,13 +51,14 @@ def 
_run_schedule_k(self, sked, object_id, sked_dict, path_root, ein): ) result = reader.parse(individual_sked, parent_path=path_root) - self.whole_filing_data.append({ - 'schedule_name': sked, - 'groups': result['groups'], - 'schedule_parts': result['schedule_parts'], - 'csv_line_array':result['csv_line_array'] - - }) + self.whole_filing_data.append( + { + "schedule_name": sked, + "groups": result["groups"], + "schedule_parts": result["schedule_parts"], + "csv_line_array": result["csv_line_array"], + } + ) else: reader = SkedDictReader( self.standardizer, @@ -63,17 +70,19 @@ def _run_schedule_k(self, sked, object_id, sked_dict, path_root, ein): ) result = reader.parse(sked_dict, parent_path=path_root) - self.whole_filing_data.append({ - 'schedule_name': sked, - 'groups': result['groups'], - 'schedule_parts': result['schedule_parts'], - 'csv_line_array':result['csv_line_array'] - }) + self.whole_filing_data.append( + { + "schedule_name": sked, + "groups": result["groups"], + "schedule_parts": result["schedule_parts"], + "csv_line_array": result["csv_line_array"], + } + ) def _run_schedule(self, sked, object_id, sked_dict, ein): path_root = "/" + sked # Only sked K (bonds) is allowed to repeat - if sked == 'IRS990ScheduleK': + if sked == "IRS990ScheduleK": self._run_schedule_k(sked, object_id, sked_dict, path_root, ein) else: @@ -85,24 +94,27 @@ def _run_schedule(self, sked, object_id, sked_dict, ein): documentation=self.documentation, csv_format=self.csv_format, ) - if sked == 'ReturnHeader990x': + if sked == "ReturnHeader990x": path_root = "/ReturnHeader" result = reader.parse(sked_dict, parent_path=path_root) - self.whole_filing_data.append({ - 'schedule_name': sked, - 'groups': result['groups'], - 'schedule_parts': result['schedule_parts'], - 'csv_line_array':result['csv_line_array'] - - }) + self.whole_filing_data.append( + { + "schedule_name": sked, + "groups": result["groups"], + "schedule_parts": result["schedule_parts"], + "csv_line_array": 
result["csv_line_array"], + } + ) - if len(result['group_keyerrors']) > 0 or len(result['keyerrors'])> 0: - self.filing_keyerr_data.append({ - 'schedule_name': sked, - 'group_keyerrors':result['group_keyerrors'], - 'keyerrors':result['keyerrors'], - }) + if len(result["group_keyerrors"]) > 0 or len(result["keyerrors"]) > 0: + self.filing_keyerr_data.append( + { + "schedule_name": sked, + "group_keyerrors": result["group_keyerrors"], + "keyerrors": result["keyerrors"], + } + ) def run_filing(self, object_id, verbose=False): self.whole_filing_data = [] @@ -112,7 +124,9 @@ def run_filing(self, object_id, verbose=False): this_version = this_filing.get_version() if verbose: print("Filing %s is version %s" % (object_id, this_version)) - if this_version in ALLOWED_VERSIONSTRINGS or ( self.csv_format and this_version in CSV_ALLOWED_VERSIONSTRINGS ): + if this_version in ALLOWED_VERSIONSTRINGS or ( + self.csv_format and this_version in CSV_ALLOWED_VERSIONSTRINGS + ): this_version = this_filing.get_version() schedules = this_filing.list_schedules() ein = this_filing.get_ein() @@ -123,14 +137,16 @@ def run_filing(self, object_id, verbose=False): this_filing.set_result(self.whole_filing_data) this_filing.set_keyerrors(self.filing_keyerr_data) - if verbose and not self.csv_format: # csv format works on years with many, many keyerrors, - if len(self.filing_keyerr_data)>0: + if ( + verbose and not self.csv_format + ): # csv format works on years with many, many keyerrors, + if len(self.filing_keyerr_data) > 0: print("In %s keyerrors: %s" % (object_id, self.filing_keyerr_data)) else: print("No keyerrors found") return this_filing else: - print("Filing version %s isn't supported for this operation" % this_version ) + print("Filing version %s isn't supported for this operation" % this_version) return this_filing """ @@ -157,7 +173,6 @@ def run_from_filing_obj(self, this_filing, verbose=False): return this_filing """ - def run_sked(self, object_id, sked, verbose=False): """ sked is 
the proper name of the schedule: @@ -168,7 +183,9 @@ def run_sked(self, object_id, sked, verbose=False): this_filing = Filing(object_id) this_filing.process(verbose=verbose) this_version = this_filing.get_version() - if this_version in ALLOWED_VERSIONSTRINGS or ( self.csv_format and this_version in CSV_ALLOWED_VERSIONSTRINGS ): + if this_version in ALLOWED_VERSIONSTRINGS or ( + self.csv_format and this_version in CSV_ALLOWED_VERSIONSTRINGS + ): this_version = this_filing.get_version() ein = this_filing.get_ein() sked_dict = this_filing.get_schedule(sked) @@ -178,5 +195,5 @@ def run_sked(self, object_id, sked, verbose=False): this_filing.set_keyerrors(self.filing_keyerr_data) return this_filing else: - print("Filing version %s isn't supported for this operation" % this_version ) + print("Filing version %s isn't supported for this operation" % this_version) return this_filing diff --git a/metadata b/metadata deleted file mode 160000 index fbc5b8f..0000000 --- a/metadata +++ /dev/null @@ -1 +0,0 @@ -Subproject commit fbc5b8f3f7baa2c611ed653d5429c93a8a9aa609 diff --git a/setup.py b/setup.py index b8063c7..1cba3bb 100644 --- a/setup.py +++ b/setup.py @@ -1,44 +1,57 @@ from distutils.core import setup import os -NAME = 'irsx' -HUMAN_NAME = 'irsx' +NAME = "irsx" +HUMAN_NAME = "irsx" HERE = os.path.abspath(os.path.dirname(__file__)) version_ns = {} -with open(os.path.join(HERE, 'irs_reader', '_version.py')) as f: +with open(os.path.join(HERE, "irs_reader", "_version.py")) as f: exec(f.read(), {}, version_ns) -setup(name=HUMAN_NAME, - description = "Turn the IRS' versioned XML 990's into python objects \ +setup( + name=HUMAN_NAME, + description="Turn the IRS' versioned XML 990's into python objects \ with original line number and description.", - version = version_ns['__version__'], - author = 'Jacob Fenton', - author_email = 'jsfenfen@gmail.com', - url = 'https://github.com/jsfenfen/990-xml-reader', - license = 'MIT', - setup_requires = ["setuptools", ], - install_requires 
= ['requests', 'xmltodict', 'unicodecsv'], - tests_require = ['nose', 'requests', 'xmltodict', 'unicodecsv', 'tox', 'tox-pyenv',], - packages = ['irsx'], - package_dir = {'irsx': 'irs_reader'}, - package_data = {'irsx': ['metadata/*.csv']}, - keywords = ['990', 'nonprofit', 'tax'], - entry_points = { - "console_scripts": ["irsx=irsx.irsx_cli:main", - "irsx_index=irsx.irsx_index_cli:main", - "irsx_retrieve=irsx.irsx_retrieve_cli:main"] - }, - classifiers=[ - # How mature is this project? Common values are - # 3 - Alpha - # 4 - Beta - # 5 - Production/Stable - 'Development Status :: 3 - Alpha', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - - ], - ) + version=version_ns["__version__"], + author="Jacob Fenton", + author_email="jsfenfen@gmail.com", + url="https://github.com/jsfenfen/990-xml-reader", + license="MIT", + setup_requires=[ + "setuptools", + ], + install_requires=["requests", "xmltodict", "unicodecsv"], + extras_require={ + "tests": [ + "nose", + "requests", + "xmltodict", + "unicodecsv", + "tox", + "tox-pyenv", + ] + }, + packages=["irsx"], + package_dir={"irsx": "irs_reader"}, + package_data={"irsx": ["metadata/*.csv"]}, + keywords=["990", "nonprofit", "tax"], + entry_points={ + "console_scripts": [ + "irsx=irsx.irsx_cli:main", + "irsx_index=irsx.irsx_index_cli:main", + "irsx_retrieve=irsx.irsx_retrieve_cli:main", + ] + }, + classifiers=[ + # How mature is this project? 
Common values are + # 3 - Alpha + # 4 - Beta + # 5 - Production/Stable + "Development Status :: 3 - Alpha", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + ], +) diff --git a/tests.py b/tests.py deleted file mode 100644 index 9342ba6..0000000 --- a/tests.py +++ /dev/null @@ -1,242 +0,0 @@ -import os -import json -from unittest import TestCase - -from irs_reader.file_utils import validate_object_id -from irs_reader.filing import Filing -from irs_reader.settings import WORKING_DIRECTORY, ALLOWED_VERSIONSTRINGS -from irs_reader.standardizer import Standardizer -from irs_reader.sked_dict_reader import SkedDictReader -from irs_reader.type_utils import listType -from irs_reader.xmlrunner import XMLRunner - - - -## Tests need to be reworked following IRS change in release. - - -# some test ids -from irs_reader.object_ids import object_ids_2017, \ - object_ids_2016, object_ids_2015 - -# For running cli stuff -from irs_reader.irsx_cli import run_main as run_cli_main, \ - get_parser as get_cli_parser -from irs_reader.irsx_index_cli import run_cli_index_main, \ - get_cli_index_parser - - -# FILING_2015V21 = '201642229349300909' -# FILING_2015V21_skeds = [ -# 'ReturnHeader990x', 'IRS990', 'IRS990ScheduleA', -# 'IRS990ScheduleB', 'IRS990ScheduleD', 'IRS990ScheduleM', -# 'IRS990ScheduleO' -# ] - -# # SUTTER HEALTH SACRAMENTO REGION 2014 filing has multiple schedule K's. 
-# FILING_2014V50 = '201533089349301428' - -# FILING_2014V50_skeds = [ -# 'ReturnHeader990x', 'IRS990', 'IRS990ScheduleA', 'IRS990ScheduleB', -# 'IRS990ScheduleC', 'IRS990ScheduleD', 'IRS990ScheduleG', -# 'IRS990ScheduleH', 'IRS990ScheduleI', 'IRS990ScheduleJ', -# 'IRS990ScheduleK', 'IRS990ScheduleL', 'IRS990ScheduleM', -# 'IRS990ScheduleO', 'IRS990ScheduleR' -# ] - -FILING_2022 = '202210409349301026' - -# don't bother testing every filing in tests -TEST_DEPTH = 10 - -# When set to false don't test download files that are already there. -# Runs faster set to off! -DOWNLOAD = False - - -def test_valid_object_id(): - result = validate_object_id(FILING_2022) - - -def test_process_from_id_only(): - a = Filing(FILING_2022) - a.process() - - -# def test_process_from_id_only_2(): -# a = Filing(FILING_2014V50) -# a.process() -# assert a.get_version() == '2014v5.0' - - -# def test_process_with_filepath(): -# filename = "%s_public.xml" % FILING_2015V21 -# filepath = os.path.join(WORKING_DIRECTORY, filename) -# a = Filing(FILING_2015V21, filepath=filepath) -# a.process() -# assert a.get_version() == '2015v2.1' - - -# test without runner -class TestConversion: - """ Still doesn't validate actual values, but... 
""" - - def setUp(self): - self.xml_runner = XMLRunner() - - def test_case_1(self): - parsed_filing = self.xml_runner.run_filing(FILING_2022) - - # def test_case_2(self): - # object_ids = object_ids_2017[:TEST_DEPTH] \ - # + object_ids_2016[:TEST_DEPTH] + object_ids_2015[:TEST_DEPTH] - # for object_id in object_ids: - # self.xml_runner.run_filing(object_id) - -# class TestRunner: -# """ Test using runner class """ - -# def setUp(self): -# self.xml_runner = XMLRunner() - -# def test1(self): -# parsed_filing = self.xml_runner.run_filing(FILING_2022) -# assert parsed_filing.get_type()=='IRS990' -# parsed_filing_schedules = parsed_filing.list_schedules() -# for sked in FILING_2015V21_skeds: -# assert sked in parsed_filing_schedules -# parsed_filing.get_parsed_sked(sked) - -# def test_multiple_sked_ks(self): -# parsed_filing = self.xml_runner.run_filing(FILING_2014V50) -# assert parsed_filing.get_type()=='IRS990' -# parsed_filing_schedules = parsed_filing.list_schedules() -# for sked in FILING_2014V50_skeds: -# assert sked in parsed_filing_schedules -# parsed_filing.get_parsed_sked(sked) -# def test_with_standardizer(self): -# standardizer = Standardizer() -# self.xml_runner = XMLRunner(standardizer=standardizer) - - -# class TestWithDownload: -# def setUp(self): -# self.filing = Filing(FILING_2015V21) -# if os.path.isfile(self.filing.get_filepath()): -# if DOWNLOAD: -# os.remove(self.filing.get_filepath()) - -# def test_case_1(self): -# self.filing.process() -# assert self.filing.get_version() == '2015v2.1' - -# def test_case_2(self): -# self.filing.process() -# f_skeds = self.filing.list_schedules() -# assert f_skeds == FILING_2015V21_skeds -# for f_sked in f_skeds: -# self.filing.get_schedule(f_sked) - - -class TestCommandLine: - def setUp(self): - parser = get_cli_parser() - self.parser = parser - - def test_cli_1(self): - args = self.parser.parse_args([FILING_2022, '--verbose']) - # Does it run? Output is to std out. 
- run_cli_main(args) - - def test_cli_2(self): - # dump only main 990 in bare json format - test_args = ['--schedule', 'IRS990', '--xpath', '202210409349301026'] - args = self.parser.parse_args(test_args) - run_cli_main(args) - - def test_cli_3(self): - test_args = ['--schedule', 'IRS990', FILING_2022] - args = self.parser.parse_args(test_args) - run_cli_main(args) - - """Testing the csv option without file set somehow breaks - it seems like it's some interaction between how nose handles output - and how we're outputting? Point is, the script works when the test fails. - So only test with the --file output option... - """ - def test_cli_4(self): - test_args = [ - '--schedule', 'IRS990', - '--format', 'csv', - '--file', 'testout.csv', - '202210409349301026' - ] - args = self.parser.parse_args(test_args) - run_cli_main(args) - - - def test_cli_5(self): - test_args = [ - '--schedule', 'IRS990', - '--format', 'txt', - '--file','testout.csv', - '--verbose', - '202210409349301026' - ] - args = self.parser.parse_args(test_args) - run_cli_main(args) - - def test_cli_6(self): - test_args = [ - '--format', 'txt', - '202210409349301026' - ] - args = self.parser.parse_args(test_args) - run_cli_main(args) - - def test_cli_7(self): - test_args = [ - '--format', 'txt', - '--xpath', - '--verbose', - '202210409349301026' - ] - args = self.parser.parse_args(test_args) - run_cli_main(args) - - def test_cli_8(self): - test_args = [ - '--list_schedules', - '202210409349301026' - ] - args = self.parser.parse_args(test_args) - run_cli_main(args) - - def test_cli_8(self): - test_args = [ - '--format', 'txt', - '202210409349301026' - ] - args = self.parser.parse_args(test_args) - run_cli_main(args) - - def test_cli_namespaced(self): - test_args = [ - '--format', 'txt', - '202210409349301026' # tags start with "irs:" - ] - args = self.parser.parse_args(test_args) - run_cli_main(args) - -class TestCommandLine_Index: - - def setUp(self): - parser = get_cli_index_parser() - self.parser = 
parser - - def test_cli_index_1(self): - args = self.parser.parse_args(['--year', '2017']) - # Does it run? Output is to the 2017 index file. - if DOWNLOAD: - run_cli_index_main(args) - diff --git a/tests/tests.py b/tests/tests.py new file mode 100644 index 0000000..d516ea1 --- /dev/null +++ b/tests/tests.py @@ -0,0 +1,232 @@ +import os + +import pytest + +from irs_reader.file_utils import validate_object_id +from irs_reader.filing import Filing +from irs_reader.irsx_cli import get_parser as get_cli_parser +from irs_reader.irsx_cli import run_main as run_cli_main +from irs_reader.irsx_index_cli import get_cli_index_parser, run_cli_index_main +from irs_reader.object_ids import (object_ids_2015, object_ids_2016, + object_ids_2017) +from irs_reader.settings import WORKING_DIRECTORY +from irs_reader.standardizer import Standardizer +from irs_reader.xmlrunner import XMLRunner + +FILING_2015V21 = "201642229349300909" +FILING_2015V21_skeds = [ + "ReturnHeader990x", + "IRS990", + "IRS990ScheduleA", + "IRS990ScheduleB", + "IRS990ScheduleD", + "IRS990ScheduleM", + "IRS990ScheduleO", +] + +# SUTTER HEALTH SACRAMENTO REGION 2014 filing has multiple schedule K's. +FILING_2014V50 = "201533089349301428" + +FILING_2014V50_skeds = [ + "ReturnHeader990x", + "IRS990", + "IRS990ScheduleA", + "IRS990ScheduleB", + "IRS990ScheduleC", + "IRS990ScheduleD", + "IRS990ScheduleG", + "IRS990ScheduleH", + "IRS990ScheduleI", + "IRS990ScheduleJ", + "IRS990ScheduleK", + "IRS990ScheduleL", + "IRS990ScheduleM", + "IRS990ScheduleO", + "IRS990ScheduleR", +] + +FILING_2022 = "202210409349301026" + +# don't bother testing every filing in tests +TEST_DEPTH = 10 + +# When set to false don't test download files that are already there. +# Runs faster set to off! 
+DOWNLOAD = False + + +def test_valid_object_id(): + result = validate_object_id(FILING_2022) + + +def test_process_from_id_only(): + a = Filing(FILING_2022) + a.process() + + +def test_process_from_id_only_2(): + a = Filing(FILING_2014V50) + a.process() + assert a.get_version() == "2014v5.0" + + +@pytest.mark.skip(reason="Not sure why this is failing now. Was commented out.") +def test_process_with_filepath(): + filename = "%s_public.xml" % FILING_2015V21 + filepath = os.path.join(WORKING_DIRECTORY, filename) + a = Filing(FILING_2015V21, filepath=filepath) + a.process() + assert a.get_version() == "2015v2.1" + + +# test without runner +class TestConversion: + """Still doesn't validate actual values, but...""" + + def setup_method(self): + self.xml_runner = XMLRunner() + + def test_case_1(self): + self.xml_runner.run_filing(FILING_2022) + + def test_case_2(self): + object_ids = ( + object_ids_2017[:TEST_DEPTH] + + object_ids_2016[:TEST_DEPTH] + + object_ids_2015[:TEST_DEPTH] + ) + for object_id in object_ids: + self.xml_runner.run_filing(object_id) + + +class TestRunner: + """Test using runner class""" + + def setup_method(self): + self.xml_runner = XMLRunner() + + @pytest.mark.skip(reason="Not sure why this is failing now. 
Was commented out.") + def test1(self): + parsed_filing = self.xml_runner.run_filing(FILING_2022) + assert parsed_filing.get_type() == "IRS990" + parsed_filing_schedules = parsed_filing.list_schedules() + for sked in FILING_2015V21_skeds: + assert sked in parsed_filing_schedules + parsed_filing.get_parsed_sked(sked) + + def test_multiple_sked_ks(self): + parsed_filing = self.xml_runner.run_filing(FILING_2014V50) + assert parsed_filing.get_type() == "IRS990" + parsed_filing_schedules = parsed_filing.list_schedules() + for sked in FILING_2014V50_skeds: + assert sked in parsed_filing_schedules + parsed_filing.get_parsed_sked(sked) + + def test_with_standardizer(self): + standardizer = Standardizer() + self.xml_runner = XMLRunner(standardizer=standardizer) + + +class TestWithDownload: + def setup_method(self): + self.filing = Filing(FILING_2015V21) + if os.path.isfile(self.filing.get_filepath()): + if DOWNLOAD: + os.remove(self.filing.get_filepath()) + + def test_case_1(self): + self.filing.process() + assert self.filing.get_version() == "2015v2.1" + + def test_case_2(self): + self.filing.process() + f_skeds = self.filing.list_schedules() + assert f_skeds == FILING_2015V21_skeds + for f_sked in f_skeds: + self.filing.get_schedule(f_sked) + + +class TestCommandLine: + def setup_method(self): + parser = get_cli_parser() + self.parser = parser + + def test_cli_1(self): + args = self.parser.parse_args([FILING_2022, "--verbose"]) + # Does it run? Output is to std out. 
+ run_cli_main(args) + + def test_cli_2(self): + # dump only main 990 in bare json format + test_args = ["--schedule", "IRS990", "--xpath", "202210409349301026"] + args = self.parser.parse_args(test_args) + run_cli_main(args) + + def test_cli_3(self): + test_args = ["--schedule", "IRS990", FILING_2022] + args = self.parser.parse_args(test_args) + run_cli_main(args) + + def test_cli_4(self): + test_args = [ + "--schedule", + "IRS990", + "--format", + "csv", + "--file", + "testout.csv", + "202210409349301026", + ] + args = self.parser.parse_args(test_args) + run_cli_main(args) + + def test_cli_5(self): + test_args = [ + "--schedule", + "IRS990", + "--format", + "txt", + "--file", + "testout.csv", + "--verbose", + "202210409349301026", + ] + args = self.parser.parse_args(test_args) + run_cli_main(args) + + def test_cli_6(self): + test_args = ["--format", "txt", "202210409349301026"] + args = self.parser.parse_args(test_args) + run_cli_main(args) + + def test_cli_7(self): + test_args = ["--format", "txt", "--xpath", "--verbose", "202210409349301026"] + args = self.parser.parse_args(test_args) + run_cli_main(args) + + def test_cli_8(self): + test_args = ["--list_schedules", "202210409349301026"] + args = self.parser.parse_args(test_args) + run_cli_main(args) + + def test_cli_9(self): + test_args = ["--format", "txt", "202210409349301026"] + args = self.parser.parse_args(test_args) + run_cli_main(args) + + def test_cli_namespaced(self): + test_args = ["--format", "txt", "202210409349301026"] # tags start with "irs:" + args = self.parser.parse_args(test_args) + run_cli_main(args) + + +class TestCommandLine_Index: + def setup_method(self): + parser = get_cli_index_parser() + self.parser = parser + + def test_cli_index_1(self): + args = self.parser.parse_args(["--year", "2017"]) + # Does it run? Output is to the 2017 index file. 
+ if DOWNLOAD: + run_cli_index_main(args) diff --git a/tox.ini b/tox.ini deleted file mode 100644 index ee2e89a..0000000 --- a/tox.ini +++ /dev/null @@ -1,5 +0,0 @@ -[tox] -envlist = py36,py37,py38,py39 -[testenv] -deps=nose -commands=nosetests From 73e64e402b9844666562c77c9fcffba88fbc8ebe Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Wed, 12 Jun 2024 23:17:05 -0400 Subject: [PATCH 16/39] Create python-package.yml --- .github/workflows/python-package.yml | 40 ++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/python-package.yml diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..7200dcd --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,40 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest + pip install .[tests] + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest From e4759a7141b679a97679fec6f3ba2641a70d51d3 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Wed, 12 Jun 2024 23:34:55 -0400 Subject: [PATCH 17/39] flake8 --- irs_reader/filing.py | 12 ++---------- irs_reader/irsx_cli.py | 4 ++-- irs_reader/irsx_index_cli.py | 1 - irs_reader/irsx_retrieve_cli.py | 3 +-- irs_reader/keyerror_utils.py | 2 +- irs_reader/settings.py | 5 ++--- irs_reader/sked_dict_reader.py | 4 ++-- irs_reader/standardizer.py | 15 ++------------- irs_reader/text_format_utils.py | 6 +++--- irs_reader/xmlrunner.py | 32 ++------------------------------ setup.py | 7 +------ tests/{tests.py => test_all.py} | 19 +++++++++---------- 12 files changed, 27 insertions(+), 83 deletions(-) rename tests/{tests.py => test_all.py} (91%) diff --git a/irs_reader/filing.py b/irs_reader/filing.py index 918599b..80db1ae 100644 --- a/irs_reader/filing.py +++ b/irs_reader/filing.py @@ -1,22 +1,14 @@ import io import json import os -import sys from collections import OrderedDict from xml.parsers.expat import ExpatError import xmltodict from .file_utils import get_local_path, get_s3_URL, stream_download, validate_object_id -from .settings import IRS_READER_ROOT, KNOWN_SCHEDULES -from .type_utils import ( - dictType, - listType, - noneType, - orderedDictType, - strType, - unicodeType, -) +from .settings import KNOWN_SCHEDULES +from .type_utils import dictType, listType, orderedDictType class InvalidXMLException(Exception): diff --git a/irs_reader/irsx_cli.py b/irs_reader/irsx_cli.py index 6b7a57c..8bb0519 100644 --- a/irs_reader/irsx_cli.py +++ b/irs_reader/irsx_cli.py @@ -1,8 +1,8 @@ import argparse from .filing import Filing -from .settings import IRS_READER_ROOT, KNOWN_SCHEDULES -from .text_format_utils import * +from .settings import KNOWN_SCHEDULES +from .text_format_utils import to_csv, to_json, to_txt from .xmlrunner import XMLRunner diff --git 
a/irs_reader/irsx_index_cli.py b/irs_reader/irsx_index_cli.py index 2df69a1..6567c2c 100644 --- a/irs_reader/irsx_index_cli.py +++ b/irs_reader/irsx_index_cli.py @@ -1,5 +1,4 @@ import argparse -import sys from datetime import date from .file_utils import get_index_file_URL, get_local_index_path, stream_download diff --git a/irs_reader/irsx_retrieve_cli.py b/irs_reader/irsx_retrieve_cli.py index 27eba50..38b1f7d 100644 --- a/irs_reader/irsx_retrieve_cli.py +++ b/irs_reader/irsx_retrieve_cli.py @@ -1,6 +1,5 @@ import argparse import os -import sys from zipfile import ZipFile from .file_utils import stream_download @@ -79,7 +78,7 @@ def run_cli_retrieve_main(args_read): print( """ Please visit https://www.irs.gov/charities-non-profits/form-990-series-downloads - To see if any additional files are available. + To see if any additional files are available. """ ) for year in args_read.year: diff --git a/irs_reader/keyerror_utils.py b/irs_reader/keyerror_utils.py index 6e419c5..7b5348a 100644 --- a/irs_reader/keyerror_utils.py +++ b/irs_reader/keyerror_utils.py @@ -1,6 +1,6 @@ ignorable_keyerrors = ["/ReturnHeader/BuildTS"] -## Todo: put in 2013 / 2015 series canonicals. +# Todo: put in 2013 / 2015 series canonicals. 
# 2013 vars that no longer exist discontinued_2013_vars = [ "/IRS990ScheduleA/CertificationInd", diff --git a/irs_reader/settings.py b/irs_reader/settings.py index 08adf35..e12e63e 100644 --- a/irs_reader/settings.py +++ b/irs_reader/settings.py @@ -1,5 +1,4 @@ import os -import sys from .dir_utils import mkdir_p @@ -117,7 +116,7 @@ ] # 2020 is experimental -# see https://www.irs.gov/charities-non-profits/ty2020-xml-schemas-and-business-rules-for-exempt-organizations-modernized-e-file +# see https://www.irs.gov/charities-non-profits/ty2020-xml-schemas-and-business-rules-for-exempt-organizations-modernized-e-file # noqa # We can capture the group structure for these so it doesn't break # but these versions ARE NOT supported and aren't mapped to IRSx variables @@ -145,6 +144,6 @@ mkdir_p([WORKING_DIRECTORY, INDEX_DIRECTORY]) try: - from .local_settings import * + from .local_settings import * # noqa except ImportError: pass diff --git a/irs_reader/sked_dict_reader.py b/irs_reader/sked_dict_reader.py index 264fa97..d5e38d3 100644 --- a/irs_reader/sked_dict_reader.py +++ b/irs_reader/sked_dict_reader.py @@ -1,6 +1,5 @@ from .flatten_utils import flatten from .keyerror_utils import ignorable_keyerror -from .settings import LOG_KEY from .type_utils import ( dictType, listType, @@ -228,7 +227,8 @@ def _parse_json(self, json_node, parent_path=""): pass elif this_node_type == strType: - msg = "String '%s'" % json_node + pass + # msg = "String '%s'" % json_node # self.logging.debug(msg) else: raise Exception("Unhandled type: %s" % (type(json_node))) diff --git a/irs_reader/standardizer.py b/irs_reader/standardizer.py index 9de0b9c..535fc31 100644 --- a/irs_reader/standardizer.py +++ b/irs_reader/standardizer.py @@ -1,18 +1,7 @@ -import collections +import csv import os -import sys -# import logging -from datetime import datetime - -from .settings import KEYERROR_LOG, METADATA_DIRECTORY -from .sked_dict_reader import SkedDictReader -from .type_utils import listType - -if 
sys.version_info >= (3, 0): - import csv -else: - import unicodecsv as csv +from .settings import METADATA_DIRECTORY class Standardizer(object): diff --git a/irs_reader/text_format_utils.py b/irs_reader/text_format_utils.py index 6c22a1f..4543352 100644 --- a/irs_reader/text_format_utils.py +++ b/irs_reader/text_format_utils.py @@ -6,7 +6,7 @@ import unicodecsv -from .standardizer import Documentizer, Standardizer, VersionDocumentizer +from .standardizer import VersionDocumentizer BRACKET_RE = re.compile(r"\[.*?\]") @@ -114,7 +114,7 @@ def to_txt( for result in results: for this_result in result["csv_line_array"]: - #### Collect the variables we need + # Collect the variables we need vardata = None textoutput = "\n" # This is what we'll eventually write out this_result["form"] = this_result["xpath"].split("/")[1] @@ -137,7 +137,7 @@ def to_txt( ) this_result["description"] = debracket(raw_description) - #### Write the output, now that we've got the vars + # Write the output, now that we've got the vars if this_sked_name != this_result["form"]: textoutput += ( diff --git a/irs_reader/xmlrunner.py b/irs_reader/xmlrunner.py index e25eea5..88ccae7 100644 --- a/irs_reader/xmlrunner.py +++ b/irs_reader/xmlrunner.py @@ -1,11 +1,7 @@ from .filing import Filing -from .settings import ( - ALLOWED_VERSIONSTRINGS, - CSV_ALLOWED_VERSIONSTRINGS, - WORKING_DIRECTORY, -) +from .settings import ALLOWED_VERSIONSTRINGS, CSV_ALLOWED_VERSIONSTRINGS from .sked_dict_reader import SkedDictReader -from .standardizer import Documentizer, Standardizer, VersionDocumentizer +from .standardizer import Documentizer, Standardizer # from .log_utils import configure_logging from .type_utils import listType @@ -149,30 +145,6 @@ def run_filing(self, object_id, verbose=False): print("Filing version %s isn't supported for this operation" % this_version) return this_filing - """ - def run_from_filing_obj(self, this_filing, verbose=False): - - #Run from a pre-created filing object. 
- - self.whole_filing_data = [] - self.filing_keyerr_data = [] - this_filing.process(verbose=verbose) - object_id = this_filing.get_object_id() - this_version = this_filing.get_version() - if this_version in ALLOWED_VERSIONSTRINGS: - this_version = this_filing.get_version() - schedules = this_filing.list_schedules() - ein = this_filing.get_ein() - for sked in schedules: - sked_dict = this_filing.get_schedule(sked) - self._run_schedule(sked, object_id, sked_dict, ein) - this_filing.set_result(self.whole_filing_data) - this_filing.set_keyerrors(self.filing_keyerr_data) - return this_filing - else: - return this_filing - """ - def run_sked(self, object_id, sked, verbose=False): """ sked is the proper name of the schedule: diff --git a/setup.py b/setup.py index 1cba3bb..11410fc 100644 --- a/setup.py +++ b/setup.py @@ -23,12 +23,7 @@ install_requires=["requests", "xmltodict", "unicodecsv"], extras_require={ "tests": [ - "nose", - "requests", - "xmltodict", - "unicodecsv", - "tox", - "tox-pyenv", + "pytest", ] }, packages=["irsx"], diff --git a/tests/tests.py b/tests/test_all.py similarity index 91% rename from tests/tests.py rename to tests/test_all.py index d516ea1..f6fd45f 100644 --- a/tests/tests.py +++ b/tests/test_all.py @@ -2,16 +2,15 @@ import pytest -from irs_reader.file_utils import validate_object_id -from irs_reader.filing import Filing -from irs_reader.irsx_cli import get_parser as get_cli_parser -from irs_reader.irsx_cli import run_main as run_cli_main -from irs_reader.irsx_index_cli import get_cli_index_parser, run_cli_index_main -from irs_reader.object_ids import (object_ids_2015, object_ids_2016, - object_ids_2017) -from irs_reader.settings import WORKING_DIRECTORY -from irs_reader.standardizer import Standardizer -from irs_reader.xmlrunner import XMLRunner +from irsx.file_utils import validate_object_id +from irsx.filing import Filing +from irsx.irsx_cli import get_parser as get_cli_parser +from irsx.irsx_cli import run_main as run_cli_main +from 
irsx.irsx_index_cli import get_cli_index_parser, run_cli_index_main +from irsx.object_ids import object_ids_2015, object_ids_2016, object_ids_2017 +from irsx.settings import WORKING_DIRECTORY +from irsx.standardizer import Standardizer +from irsx.xmlrunner import XMLRunner FILING_2015V21 = "201642229349300909" FILING_2015V21_skeds = [ From 77cfcef58e2a553fc42bc502c9d22c018cd0ba49 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Wed, 12 Jun 2024 23:37:19 -0400 Subject: [PATCH 18/39] gh action checkout get submodules --- .github/workflows/python-package.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 7200dcd..2dc48e6 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -20,6 +20,8 @@ jobs: steps: - uses: actions/checkout@v4 + with: + submodules: true - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: From dc65ddab22007813e3ac22fba77a1d3421196739 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Wed, 12 Jun 2024 23:37:55 -0400 Subject: [PATCH 19/39] flake8 config --- .flake8 | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..0bc799f --- /dev/null +++ b/.flake8 @@ -0,0 +1,8 @@ +[flake8] +exclude = + venv, + **/migrations/* +# So flake8 plays nicely with black +# https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html +max-line-length = 120 +extend-ignore = E203 \ No newline at end of file From 6b9c80d9d7deae14b96d79022417540fc365c84c Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Wed, 12 Jun 2024 23:41:44 -0400 Subject: [PATCH 20/39] black and isort checks --- .github/workflows/python-package.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 2dc48e6..f637127 100644 --- 
a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -29,7 +29,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest + python -m pip install flake8 pytest black isort pip install .[tests] - name: Lint with flake8 run: | @@ -37,6 +37,10 @@ jobs: flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Check black + run: black --check + - name: isort + run: isort --check-only - name: Test with pytest run: | pytest From a7e71f05829200d522d03b03135091bca53300b6 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Wed, 12 Jun 2024 23:43:36 -0400 Subject: [PATCH 21/39] black and isort checks --- .github/workflows/python-package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index f637127..0054d61 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -38,9 +38,9 @@ jobs: # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Check black - run: black --check + run: black --check irs_reader tests - name: isort - run: isort --check-only + run: isort --check-only irs_reader tests - name: Test with pytest run: | pytest From 32852511de1eb068fd457dccd39f3c526d43b560 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Wed, 12 Jun 2024 23:44:30 -0400 Subject: [PATCH 22/39] black and isort checks --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 0054d61..5b4290e 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -40,7 +40,7 @@ jobs: - name: Check black run: black --check irs_reader tests - name: isort - run: isort --check-only irs_reader tests + run: isort --profile=black --check-only irs_reader tests - name: Test with pytest run: | pytest From 48c9db7edf55c48b252ff43503661f64439390e7 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Wed, 12 Jun 2024 23:46:26 -0400 Subject: [PATCH 23/39] fix imports --- tests/test_all.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_all.py b/tests/test_all.py index f6fd45f..7ab18b4 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -1,7 +1,6 @@ import os import pytest - from irsx.file_utils import validate_object_id from irsx.filing import Filing from irsx.irsx_cli import get_parser as get_cli_parser From 277f3c95159a45062aee66c61a0cf9bf3ba91b30 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Wed, 12 Jun 2024 23:48:37 -0400 Subject: [PATCH 24/39] update testing instructions --- README.md | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index e7ced4a..11f1ddd 100644 --- a/README.md +++ b/README.md @@ -477,20 +477,16 @@ You can still add command line args, like this: ## Testing -Nosetests - Test coverage is incomplete, 
improve it with coverage.py; run 'pip install coverage' -then: - - $ nosetests --with-coverage --cover-erase --cover-package=irs_reader - -or - - $ coverage report -m - - - -Tox -- see tox.ini; testing for: 2.7,3.4,3.5,3.6. You may need to run `pip install tox` in the testing environment. +Install dependencies +```console +> pip install .[tests] +``` +And run tests +```console +> pytest +``` ## Acknowledgements From 2495bbd954ccfaa1da3352c806b8f4ff2d2b3858 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Wed, 12 Jun 2024 23:55:01 -0400 Subject: [PATCH 25/39] Update README.md --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index 11f1ddd..5f340d3 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,5 @@ # IRSx -Update: 12/16. The IRS has announced it will no longer post xml 990 filings to AWS, thereby undermining irsx' ability to automatically retrieve filings. The IRS does appear to make the raw filings available in [bulk format on this page](https://www.irs.gov/charities-non-profits/form-990-series-downloads). It is possible to use IRSx by retrieving the files and placing them at the location that IRSX expects to find them. We are seeking additional information from IRS and plan to address this soon. 
- - ## Table of Contents - [Installation](#installation) From d0be7fd76818610bb5d8c47f298eb7401d8c6d49 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 13 Jun 2024 00:12:35 -0400 Subject: [PATCH 26/39] use environmental variables instead of local_settings.py --- README.md | 18 ++++-------------- irs_reader/local_settings-example.py | 16 ---------------- irs_reader/settings.py | 26 ++++++++++++++++++-------- setup.py | 2 +- 4 files changed, 23 insertions(+), 39 deletions(-) delete mode 100644 irs_reader/local_settings-example.py diff --git a/README.md b/README.md index 11f1ddd..b86927c 100644 --- a/README.md +++ b/README.md @@ -289,21 +289,11 @@ For example: ### Legacy configuration ### -You also can configure IRSx's cache location by setting the local_settings.py file. To figure out where that settings file is, log in to a terminal and type: - - >>> from irsx.settings import IRSX_SETTINGS_LOCATION - >>> IRSX_SETTINGS_LOCATION - '/long/path/to/lib/python3.6/site-packages/irsx/settings.py' - -[ If you get an error, try upgrading irsx with `pip install irsx --upgrade` -- this feature was added in 0.1.1. ] - - -Go to that directory. You can either modify the settings.py file or the local_settings.py file. To do the latter, first `cd` into the directory where the settings files live and run: - - $ cp local_settings.py-example local_settings.py - -Then edit local_settings.py to set WORKING\_DIRECTORY to where the raw xml files are found. +You also can configure IRSx's cache location by setting an environmntal variable. 
+```console +> export IRSX_CACHE_DIRECTORY=/where/you/like +``` ## IRSx from python diff --git a/irs_reader/local_settings-example.py b/irs_reader/local_settings-example.py deleted file mode 100644 index c3d02bb..0000000 --- a/irs_reader/local_settings-example.py +++ /dev/null @@ -1,16 +0,0 @@ -import os - -from .dir_utils import mkdir_p - -IRS_READER_ROOT = "/path/to/irsreader/990-xml-reader" - -# This is the URL to amazon's bucket, could use another synced to it -IRS_XML_HTTP_BASE = "https://s3.amazonaws.com/irs-form-990" - -# The directory we put files in while we're processing them -WORKING_DIRECTORY = os.path.join(IRS_READER_ROOT, "XML") - -# Helpful to keep these around for lookup purposes -INDEX_DIRECTORY = os.path.join(IRS_READER_ROOT, "CSV") - -mkdir_p([WORKING_DIRECTORY, INDEX_DIRECTORY]) diff --git a/irs_reader/settings.py b/irs_reader/settings.py index e12e63e..59d54cd 100644 --- a/irs_reader/settings.py +++ b/irs_reader/settings.py @@ -1,26 +1,36 @@ import os +import environ + from .dir_utils import mkdir_p -IRS_READER_ROOT = os.path.abspath(os.path.dirname(__file__)) +env = environ.Env() + -# This is the URL to amazon's bucket, could use another synced to it -IRS_XML_HTTP_BASE = "https://gt990datalake-rawdata.s3.amazonaws.com/EfileData/XmlFiles" +IRS_READER_ROOT = env( + "IRS_READER_ROOT", default=os.path.abspath(os.path.dirname(__file__)) +) + +# This is the URL to Giving Tuesday's bucket, could use another synced to it +IRS_XML_HTTP_BASE = env( + "IRS_XML_HTTP_BASE", + default="https://gt990datalake-rawdata.s3.amazonaws.com/EfileData/XmlFiles", +) # It can be hard to locate this. 
IRSX_SETTINGS_LOCATION = os.path.join(IRS_READER_ROOT, "settings.py") # Defaults to the same directory as this settings file, but you can override # with the `IRSX_CACHE_DIRECTORY` environment variable -IRSX_CACHE_DIRECTORY = os.environ.get("IRSX_CACHE_DIRECTORY", IRS_READER_ROOT) +IRSX_CACHE_DIRECTORY = env("IRSX_CACHE_DIRECTORY", default=IRS_READER_ROOT) # The directory we put files in while we're processing them -WORKING_DIRECTORY = os.environ.get( - "IRSX_WORKING_DIRECTORY", os.path.join(IRSX_CACHE_DIRECTORY, "XML") +WORKING_DIRECTORY = env( + "IRSX_WORKING_DIRECTORY", default=os.path.join(IRSX_CACHE_DIRECTORY, "XML") ) # Helpful to keep these around for lookup purposes -INDEX_DIRECTORY = os.environ.get( - "IRSX_INDEX_DIRECTORY", os.path.join(IRSX_CACHE_DIRECTORY, "CSV") +INDEX_DIRECTORY = env( + "IRSX_INDEX_DIRECTORY", default=os.path.join(IRSX_CACHE_DIRECTORY, "CSV") ) IRS_INDEX_BASE = "https://apps.irs.gov/pub/epostcard/990/xml/%s/index_%s.csv" diff --git a/setup.py b/setup.py index 11410fc..5b1d891 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ setup_requires=[ "setuptools", ], - install_requires=["requests", "xmltodict", "unicodecsv"], + install_requires=["requests", "xmltodict", "django-environ"], extras_require={ "tests": [ "pytest", From 1a40d8babd211ec7222dae05ffcb18360eea737f Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 13 Jun 2024 00:26:46 -0400 Subject: [PATCH 27/39] Update text_format_utils.py --- irs_reader/text_format_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/irs_reader/text_format_utils.py b/irs_reader/text_format_utils.py index 4543352..75291df 100644 --- a/irs_reader/text_format_utils.py +++ b/irs_reader/text_format_utils.py @@ -4,8 +4,6 @@ import re import sys -import unicodecsv - from .standardizer import VersionDocumentizer BRACKET_RE = re.compile(r"\[.*?\]") @@ -67,7 +65,7 @@ def to_csv( "group_name", "group_index", ] - writer = unicodecsv.DictWriter( + writer = csv.DictWriter( stdout, 
fieldnames=fieldnames, encoding="utf-8", quoting=csv.QUOTE_MINIMAL ) writer.writeheader() # this fails in python3? From 231637dac5035d36636ab190cc0cce63900142a4 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 13 Jun 2024 00:28:42 -0400 Subject: [PATCH 28/39] Update text_format_utils.py --- irs_reader/text_format_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/irs_reader/text_format_utils.py b/irs_reader/text_format_utils.py index 75291df..4e2f8c6 100644 --- a/irs_reader/text_format_utils.py +++ b/irs_reader/text_format_utils.py @@ -66,7 +66,7 @@ def to_csv( "group_index", ] writer = csv.DictWriter( - stdout, fieldnames=fieldnames, encoding="utf-8", quoting=csv.QUOTE_MINIMAL + stdout, fieldnames=fieldnames, quoting=csv.QUOTE_MINIMAL ) writer.writeheader() # this fails in python3? results = parsed_filing.get_result() From 0a88421414c970357e4baaf088fa4a1cf233e6e4 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 13 Jun 2024 00:31:54 -0400 Subject: [PATCH 29/39] blacken --- irs_reader/text_format_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/irs_reader/text_format_utils.py b/irs_reader/text_format_utils.py index 4e2f8c6..2e395cb 100644 --- a/irs_reader/text_format_utils.py +++ b/irs_reader/text_format_utils.py @@ -65,9 +65,7 @@ def to_csv( "group_name", "group_index", ] - writer = csv.DictWriter( - stdout, fieldnames=fieldnames, quoting=csv.QUOTE_MINIMAL - ) + writer = csv.DictWriter(stdout, fieldnames=fieldnames, quoting=csv.QUOTE_MINIMAL) writer.writeheader() # this fails in python3? 
results = parsed_filing.get_result() From 8361ba50bbf5f76a958344653e2de3b08ac534da Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 13 Jun 2024 00:37:13 -0400 Subject: [PATCH 30/39] csv --- irs_reader/text_format_utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/irs_reader/text_format_utils.py b/irs_reader/text_format_utils.py index 2e395cb..d091e3b 100644 --- a/irs_reader/text_format_utils.py +++ b/irs_reader/text_format_utils.py @@ -48,11 +48,12 @@ def to_csv( ): if not vd: vd = VersionDocumentizer() - stdout = getattr(sys.stdout, "buffer", sys.stdout) + if outfilepath: - stdout = open(outfilepath, "wb") # or 'wb' ? + out_file = open(outfilepath, "w") + else: + out_file = sys.stdout - fieldnames = [] fieldnames = [ "object_id", "form", @@ -65,7 +66,7 @@ def to_csv( "group_name", "group_index", ] - writer = csv.DictWriter(stdout, fieldnames=fieldnames, quoting=csv.QUOTE_MINIMAL) + writer = csv.DictWriter(out_file, fieldnames=fieldnames, quoting=csv.QUOTE_MINIMAL) writer.writeheader() # this fails in python3? 
results = parsed_filing.get_result() From 7c7b5b2a160362cad6bee9cce585fedcba870599 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 13 Jun 2024 09:31:14 -0400 Subject: [PATCH 31/39] add build steps --- .github/workflows/python-package.yml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 5b4290e..2df5e83 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,6 +1,3 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python - name: Python package on: @@ -8,6 +5,7 @@ on: branches: [ "main" ] pull_request: branches: [ "main" ] + release: jobs: build: @@ -44,3 +42,13 @@ jobs: - name: Test with pytest run: | pytest + - name: Build distribution + if: ${{ github.event_name == 'release' }} + run: | + pip install build + python -m build + - name: Upload source distribution + if: ${{ github.event_name == 'release' }} + uses: softprops/action-gh-release@v2 + with: + files: dist/* From 5f28011842958581e4bb3f94166f9b19902b3cdc Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 13 Jun 2024 00:12:35 -0400 Subject: [PATCH 32/39] use environmental variables instead of local_settings.py --- README.md | 18 ++++-------------- irs_reader/local_settings-example.py | 16 ---------------- irs_reader/settings.py | 26 ++++++++++++++++++-------- setup.py | 2 +- 4 files changed, 23 insertions(+), 39 deletions(-) delete mode 100644 irs_reader/local_settings-example.py diff --git a/README.md b/README.md index 5f340d3..9989814 100644 --- a/README.md +++ b/README.md @@ -286,21 +286,11 @@ For example: ### Legacy configuration ### -You also can configure IRSx's cache location by setting the local_settings.py file. 
To figure out where that settings file is, log in to a terminal and type: - - >>> from irsx.settings import IRSX_SETTINGS_LOCATION - >>> IRSX_SETTINGS_LOCATION - '/long/path/to/lib/python3.6/site-packages/irsx/settings.py' - -[ If you get an error, try upgrading irsx with `pip install irsx --upgrade` -- this feature was added in 0.1.1. ] - - -Go to that directory. You can either modify the settings.py file or the local_settings.py file. To do the latter, first `cd` into the directory where the settings files live and run: - - $ cp local_settings.py-example local_settings.py - -Then edit local_settings.py to set WORKING\_DIRECTORY to where the raw xml files are found. +You also can configure IRSx's cache location by setting an environment variable. +```console +> export IRSX_CACHE_DIRECTORY=/where/you/like +``` ## IRSx from python diff --git a/irs_reader/local_settings-example.py b/irs_reader/local_settings-example.py deleted file mode 100644 index c3d02bb..0000000 --- a/irs_reader/local_settings-example.py +++ /dev/null @@ -1,16 +0,0 @@ -import os - -from .dir_utils import mkdir_p - -IRS_READER_ROOT = "/path/to/irsreader/990-xml-reader" - -# This is the URL to amazon's bucket, could use another synced to it -IRS_XML_HTTP_BASE = "https://s3.amazonaws.com/irs-form-990" - -# The directory we put files in while we're processing them -WORKING_DIRECTORY = os.path.join(IRS_READER_ROOT, "XML") - -# Helpful to keep these around for lookup purposes -INDEX_DIRECTORY = os.path.join(IRS_READER_ROOT, "CSV") - -mkdir_p([WORKING_DIRECTORY, INDEX_DIRECTORY]) diff --git a/irs_reader/settings.py b/irs_reader/settings.py index e12e63e..59d54cd 100644 --- a/irs_reader/settings.py +++ b/irs_reader/settings.py @@ -1,26 +1,36 @@ import os +import environ + from .dir_utils import mkdir_p -IRS_READER_ROOT = os.path.abspath(os.path.dirname(__file__)) +env = environ.Env() + -# This is the URL to amazon's bucket, could use another synced to it -IRS_XML_HTTP_BASE = 
"https://gt990datalake-rawdata.s3.amazonaws.com/EfileData/XmlFiles" +IRS_READER_ROOT = env( + "IRS_READER_ROOT", default=os.path.abspath(os.path.dirname(__file__)) +) + +# This is the URL to Giving Tuesday's bucket, could use another synced to it +IRS_XML_HTTP_BASE = env( + "IRS_XML_HTTP_BASE", + default="https://gt990datalake-rawdata.s3.amazonaws.com/EfileData/XmlFiles", +) # It can be hard to locate this. IRSX_SETTINGS_LOCATION = os.path.join(IRS_READER_ROOT, "settings.py") # Defaults to the same directory as this settings file, but you can override # with the `IRSX_CACHE_DIRECTORY` environment variable -IRSX_CACHE_DIRECTORY = os.environ.get("IRSX_CACHE_DIRECTORY", IRS_READER_ROOT) +IRSX_CACHE_DIRECTORY = env("IRSX_CACHE_DIRECTORY", default=IRS_READER_ROOT) # The directory we put files in while we're processing them -WORKING_DIRECTORY = os.environ.get( - "IRSX_WORKING_DIRECTORY", os.path.join(IRSX_CACHE_DIRECTORY, "XML") +WORKING_DIRECTORY = env( + "IRSX_WORKING_DIRECTORY", default=os.path.join(IRSX_CACHE_DIRECTORY, "XML") ) # Helpful to keep these around for lookup purposes -INDEX_DIRECTORY = os.environ.get( - "IRSX_INDEX_DIRECTORY", os.path.join(IRSX_CACHE_DIRECTORY, "CSV") +INDEX_DIRECTORY = env( + "IRSX_INDEX_DIRECTORY", default=os.path.join(IRSX_CACHE_DIRECTORY, "CSV") ) IRS_INDEX_BASE = "https://apps.irs.gov/pub/epostcard/990/xml/%s/index_%s.csv" diff --git a/setup.py b/setup.py index 11410fc..5b1d891 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ setup_requires=[ "setuptools", ], - install_requires=["requests", "xmltodict", "unicodecsv"], + install_requires=["requests", "xmltodict", "django-environ"], extras_require={ "tests": [ "pytest", From ae899af0fe0c2feb42793f39a24a52416413d522 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 13 Jun 2024 00:26:46 -0400 Subject: [PATCH 33/39] Update text_format_utils.py --- irs_reader/text_format_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/irs_reader/text_format_utils.py 
b/irs_reader/text_format_utils.py index 4543352..75291df 100644 --- a/irs_reader/text_format_utils.py +++ b/irs_reader/text_format_utils.py @@ -4,8 +4,6 @@ import re import sys -import unicodecsv - from .standardizer import VersionDocumentizer BRACKET_RE = re.compile(r"\[.*?\]") @@ -67,7 +65,7 @@ def to_csv( "group_name", "group_index", ] - writer = unicodecsv.DictWriter( + writer = csv.DictWriter( stdout, fieldnames=fieldnames, encoding="utf-8", quoting=csv.QUOTE_MINIMAL ) writer.writeheader() # this fails in python3? From 586938304bbc401bfbe9e8976c6be75a2308fd03 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 13 Jun 2024 00:28:42 -0400 Subject: [PATCH 34/39] Update text_format_utils.py --- irs_reader/text_format_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/irs_reader/text_format_utils.py b/irs_reader/text_format_utils.py index 75291df..4e2f8c6 100644 --- a/irs_reader/text_format_utils.py +++ b/irs_reader/text_format_utils.py @@ -66,7 +66,7 @@ def to_csv( "group_index", ] writer = csv.DictWriter( - stdout, fieldnames=fieldnames, encoding="utf-8", quoting=csv.QUOTE_MINIMAL + stdout, fieldnames=fieldnames, quoting=csv.QUOTE_MINIMAL ) writer.writeheader() # this fails in python3? results = parsed_filing.get_result() From 5ad69cc811dcae7aea0395b8ac8f0f792defb293 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 13 Jun 2024 00:31:54 -0400 Subject: [PATCH 35/39] blacken --- irs_reader/text_format_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/irs_reader/text_format_utils.py b/irs_reader/text_format_utils.py index 4e2f8c6..2e395cb 100644 --- a/irs_reader/text_format_utils.py +++ b/irs_reader/text_format_utils.py @@ -65,9 +65,7 @@ def to_csv( "group_name", "group_index", ] - writer = csv.DictWriter( - stdout, fieldnames=fieldnames, quoting=csv.QUOTE_MINIMAL - ) + writer = csv.DictWriter(stdout, fieldnames=fieldnames, quoting=csv.QUOTE_MINIMAL) writer.writeheader() # this fails in python3? 
results = parsed_filing.get_result() From 769c902f2b44246ca3c0ebfefcd798af6bfd4b51 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 13 Jun 2024 00:37:13 -0400 Subject: [PATCH 36/39] csv --- irs_reader/text_format_utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/irs_reader/text_format_utils.py b/irs_reader/text_format_utils.py index 2e395cb..d091e3b 100644 --- a/irs_reader/text_format_utils.py +++ b/irs_reader/text_format_utils.py @@ -48,11 +48,12 @@ def to_csv( ): if not vd: vd = VersionDocumentizer() - stdout = getattr(sys.stdout, "buffer", sys.stdout) + if outfilepath: - stdout = open(outfilepath, "wb") # or 'wb' ? + out_file = open(outfilepath, "w") + else: + out_file = sys.stdout - fieldnames = [] fieldnames = [ "object_id", "form", @@ -65,7 +66,7 @@ def to_csv( "group_name", "group_index", ] - writer = csv.DictWriter(stdout, fieldnames=fieldnames, quoting=csv.QUOTE_MINIMAL) + writer = csv.DictWriter(out_file, fieldnames=fieldnames, quoting=csv.QUOTE_MINIMAL) writer.writeheader() # this fails in python3? 
results = parsed_filing.get_result() From 1939a40238718f5ddd6596b60276518507a68122 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 13 Jun 2024 09:31:14 -0400 Subject: [PATCH 37/39] add build steps --- .github/workflows/python-package.yml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 5b4290e..2df5e83 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,6 +1,3 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python - name: Python package on: @@ -8,6 +5,7 @@ on: branches: [ "main" ] pull_request: branches: [ "main" ] + release: jobs: build: @@ -44,3 +42,13 @@ jobs: - name: Test with pytest run: | pytest + - name: Build distribution + if: ${{ github.event_name == 'release' }} + run: | + pip install build + python -m build + - name: Upload source distribution + if: ${{ github.event_name == 'release' }} + uses: softprops/action-gh-release@v2 + with: + files: dist/* From 29ba7c371f5eaef0a8a1ba5a9f64e40c01f2726d Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 13 Jun 2024 09:48:38 -0400 Subject: [PATCH 38/39] slight changes to metadaa --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index edf6f01..1015855 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "irs_reader/metadata"] path = irs_reader/metadata - url = https://github.com/propublica/990-xml-metadata.git + url = https://github.com/datamade/990-xml-metadata.git From d8db8bb47595c61769500699f524bd0651e26e5f Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 13 Jun 2024 09:54:40 -0400 Subject: [PATCH 39/39] update commit --- irs_reader/metadata | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/irs_reader/metadata b/irs_reader/metadata index fbc5b8f..3d5dcf7 160000 --- a/irs_reader/metadata +++ b/irs_reader/metadata @@ -1 +1 @@ -Subproject commit fbc5b8f3f7baa2c611ed653d5429c93a8a9aa609 +Subproject commit 3d5dcf7ce120c90f657b1bdce937723a2bf4de20