diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml new file mode 100644 index 0000000..c94162d --- /dev/null +++ b/.github/workflows/pre-commit.yaml @@ -0,0 +1,15 @@ +--- +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + checks: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v6 + - uses: pre-commit/action@v3.0.1 diff --git a/.gitignore b/.gitignore index 89462a9..7b70675 100644 --- a/.gitignore +++ b/.gitignore @@ -141,4 +141,3 @@ checklink/cookies.txt # .gitconfig is now autogenerated .gitconfig - diff --git a/.jupyter/desktop-settings.json b/.jupyter/desktop-settings.json index bc90999..7a6cce6 100644 --- a/.jupyter/desktop-settings.json +++ b/.jupyter/desktop-settings.json @@ -1,3 +1,3 @@ { "pythonPath": "/home/kyle/anaconda3/envs/tigris/bin/python" -} \ No newline at end of file +} diff --git a/.jupyter/desktop-workspaces/default-37a8.jupyterlab-workspace b/.jupyter/desktop-workspaces/default-37a8.jupyterlab-workspace index 335789a..e0ae818 100644 --- a/.jupyter/desktop-workspaces/default-37a8.jupyterlab-workspace +++ b/.jupyter/desktop-workspaces/default-37a8.jupyterlab-workspace @@ -1 +1 @@ 
-{"data":{"layout-restorer:data":{"main":{"dock":{"type":"tab-area","currentIndex":1,"widgets":["notebook:docs/03-data-utilities.ipynb","notebook:Untitled.ipynb"]},"current":"notebook:Untitled.ipynb"},"down":{"size":0,"widgets":[]},"left":{"collapsed":false,"current":"filebrowser","widgets":["filebrowser","running-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0.17988593974175035,0.8201140602582496,0]},"file-browser-filebrowser:cwd":{"path":""},"notebook:docs/03-data-utilities.ipynb":{"data":{"path":"docs/03-data-utilities.ipynb","factory":"Notebook"}},"notebook:Untitled.ipynb":{"data":{"path":"Untitled.ipynb","factory":"Notebook"}}},"metadata":{"id":"default"}} \ No newline at end of file +{"data":{"layout-restorer:data":{"main":{"dock":{"type":"tab-area","currentIndex":1,"widgets":["notebook:docs/03-data-utilities.ipynb","notebook:Untitled.ipynb"]},"current":"notebook:Untitled.ipynb"},"down":{"size":0,"widgets":[]},"left":{"collapsed":false,"current":"filebrowser","widgets":["filebrowser","running-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0.17988593974175035,0.8201140602582496,0]},"file-browser-filebrowser:cwd":{"path":""},"notebook:docs/03-data-utilities.ipynb":{"data":{"path":"docs/03-data-utilities.ipynb","factory":"Notebook"}},"notebook:Untitled.ipynb":{"data":{"path":"Untitled.ipynb","factory":"Notebook"}}},"metadata":{"id":"default"}} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..f3788e0 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,20 @@ +--- +ci: + autofix_prs: false + +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.13.3 + hooks: + - id: ruff-check + args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format + - repo: 
https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: debug-statements + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-toml + - id: check-added-large-files diff --git a/LICENSE b/LICENSE index e87778c..096ffd6 100644 --- a/LICENSE +++ b/LICENSE @@ -4,4 +4,4 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in index f27a563..3666d8e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ include pygris/internals/*.csv -include requirements.txt \ No newline at end of file +include requirements.txt diff --git a/README.md b/README.md index 760457e..f631590 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ ## pygris -__pygris__ is a Python package to help users access US Census Bureau TIGER/Line and cartographic boundary shapefiles and load them into Python as GeoDataFrames. 
The goal of the package is to make it simple to access US Census geographic data; data can be acquired with a single function for a given geography (e.g. `tracts()` for Census tracts) along with a few options. +__pygris__ is a Python package to help users access US Census Bureau TIGER/Line and cartographic boundary shapefiles and load them into Python as GeoDataFrames. The goal of the package is to make it simple to access US Census geographic data; data can be acquired with a single function for a given geography (e.g. `tracts()` for Census tracts) along with a few options. -The package is a general port of the [R __tigris__ package](https://github.com/walkerke/tigris) with a few modifications. The framework of and philosophy behind the __tigris__ package is covered in [Chapter 5 of my book, _Analyzing US Census Data: Methods, Maps and Models in R_](https://walker-data.com/census-r/census-geographic-data-and-applications-in-r.html). +The package is a general port of the [R __tigris__ package](https://github.com/walkerke/tigris) with a few modifications. The framework of and philosophy behind the __tigris__ package is covered in [Chapter 5 of my book, _Analyzing US Census Data: Methods, Maps and Models in R_](https://walker-data.com/census-r/census-geographic-data-and-applications-in-r.html). 
Install __pygris__ from PyPI with the following command: diff --git a/docs/01-basic-usage.ipynb b/docs/01-basic-usage.ipynb index 51e8960..c074b21 100644 --- a/docs/01-basic-usage.ipynb +++ b/docs/01-basic-usage.ipynb @@ -49,7 +49,7 @@ "source": [ "import pygris\n", "\n", - "ny_roads = pygris.roads(state = \"NY\", county = \"New York\")\n", + "ny_roads = pygris.roads(state=\"NY\", county=\"New York\")\n", "\n", "ny_roads.plot()" ] @@ -100,19 +100,20 @@ } ], "source": [ - "from pygris import counties\n", "import matplotlib.pyplot as plt\n", "\n", + "from pygris import counties\n", + "\n", "# Get the default TIGER/Line file for counties in Michigan\n", - "mi_tiger = counties(state = \"MI\", cache = True)\n", + "mi_tiger = counties(state=\"MI\", cache=True)\n", "\n", "# Get the cartographic boundary file with cb = True\n", - "mi_cartographic = counties(state = \"MI\", cb = True, cache = True)\n", + "mi_cartographic = counties(state=\"MI\", cb=True, cache=True)\n", "\n", "# Plot the two side-by-side to compare them\n", - "fig, ax = plt.subplots(ncols = 2)\n", - "mi_tiger.plot(ax = ax[0])\n", - "mi_cartographic.plot(ax = ax[1])\n", + "fig, ax = plt.subplots(ncols=2)\n", + "mi_tiger.plot(ax=ax[0])\n", + "mi_cartographic.plot(ax=ax[1])\n", "\n", "ax[0].set_title(\"TIGER/Line\")\n", "ax[1].set_title(\"Cartographic\")" diff --git a/docs/02-geometries.ipynb b/docs/02-geometries.ipynb index ca15c53..031b814 100644 --- a/docs/02-geometries.ipynb +++ b/docs/02-geometries.ipynb @@ -49,7 +49,7 @@ "source": [ "from pygris import states\n", "\n", - "us = states(cb = True, resolution = \"20m\")\n", + "us = states(cb=True, resolution=\"20m\")\n", "\n", "us.plot()" ] @@ -133,7 +133,7 @@ } ], "source": [ - "us_shifted = shift_geometry(us, position = \"outside\", preserve_area = True)\n", + "us_shifted = shift_geometry(us, position=\"outside\", preserve_area=True)\n", "\n", "us_shifted.plot()" ] @@ -175,14 +175,16 @@ "import geopandas as gp\n", "from matplotlib import pyplot as plt\n", 
"\n", - "fbi_offices = gp.read_file(\"https://gist.githubusercontent.com/walkerke/a9211114fcbbe74cb04e3a6720b17d09/raw/085ca1d5dad7b37b45b0bdb5e16c160c4fb0bc37/fbi-offices.geojson\")\n", + "fbi_offices = gp.read_file(\n", + " \"https://gist.githubusercontent.com/walkerke/a9211114fcbbe74cb04e3a6720b17d09/raw/085ca1d5dad7b37b45b0bdb5e16c160c4fb0bc37/fbi-offices.geojson\"\n", + ")\n", "\n", "fbi_rescaled = shift_geometry(fbi_offices)\n", "\n", "fig, ax = plt.subplots()\n", "\n", - "us_rescaled.plot(ax = ax, color = \"grey\")\n", - "fbi_rescaled.plot(ax = ax, color = \"black\")" + "us_rescaled.plot(ax=ax, color=\"grey\")\n", + "fbi_rescaled.plot(ax=ax, color=\"black\")" ] }, { @@ -395,9 +397,9 @@ } ], "source": [ - "from pygris import tracts \n", + "from pygris import tracts\n", "\n", - "king_tracts = tracts(state = \"WA\", county = \"King\", cb = True, cache = True)\n", + "king_tracts = tracts(state=\"WA\", county=\"King\", cb=True, cache=True)\n", "\n", "king_tracts.explore()" ] @@ -614,7 +616,7 @@ "source": [ "from pygris.utils import erase_water\n", "\n", - "king_tiger = tracts(\"WA\", \"King\", cb = False, cache = True)\n", + "king_tiger = tracts(\"WA\", \"King\", cb=False, cache=True)\n", "\n", "king_erased = erase_water(king_tiger)\n", "\n", diff --git a/docs/03-data-utilities.ipynb b/docs/03-data-utilities.ipynb index a40b15b..5ad97e6 100644 --- a/docs/03-data-utilities.ipynb +++ b/docs/03-data-utilities.ipynb @@ -27,16 +27,13 @@ "source": [ "from pygris.data import get_census\n", "\n", - "us_youth_sahie = get_census(dataset = \"timeseries/healthins/sahie\",\n", - " variables = \"PCTUI_PT\",\n", - " params = {\n", - " \"for\": \"county:*\",\n", - " \"in\": \"state:*\",\n", - " \"time\": 2019,\n", - " \"AGECAT\": 4\n", - " }, \n", - " return_geoid = True, \n", - " guess_dtypes = True)" + "us_youth_sahie = get_census(\n", + " dataset=\"timeseries/healthins/sahie\",\n", + " variables=\"PCTUI_PT\",\n", + " params={\"for\": \"county:*\", \"in\": \"state:*\", 
\"time\": 2019, \"AGECAT\": 4},\n", + " return_geoid=True,\n", + " guess_dtypes=True,\n", + ")" ] }, { @@ -167,22 +164,19 @@ } ], "source": [ - "from pygris import counties \n", - "from pygris.utils import shift_geometry\n", "from matplotlib import pyplot as plt\n", "\n", - "us_counties = counties(cb = True, resolution = \"20m\", cache = True, year = 2019)\n", + "from pygris import counties\n", + "from pygris.utils import shift_geometry\n", + "\n", + "us_counties = counties(cb=True, resolution=\"20m\", cache=True, year=2019)\n", "us_counties_rescaled = shift_geometry(us_counties)\n", "\n", - "us_counties_merged = us_counties_rescaled.merge(us_youth_sahie, on = \"GEOID\")\n", + "us_counties_merged = us_counties_rescaled.merge(us_youth_sahie, on=\"GEOID\")\n", "\n", - "us_counties_merged.plot(\n", - " column = \"PCTUI_PT\",\n", - " cmap = \"viridis\",\n", - " figsize = (8, 6)\n", - ")\n", + "us_counties_merged.plot(column=\"PCTUI_PT\", cmap=\"viridis\", figsize=(8, 6))\n", "\n", - "plt.title(\"% uninsured under age 19 by county, 2019\")\n" + "plt.title(\"% uninsured under age 19 by county, 2019\")" ] }, { @@ -353,7 +347,7 @@ "source": [ "from pygris.data import get_lodes\n", "\n", - "tx_od = get_lodes(state = \"TX\", year = 2022, lodes_type = \"od\", cache = True)\n", + "tx_od = get_lodes(state=\"TX\", year=2022, lodes_type=\"od\", cache=True)\n", "\n", "tx_od.head()" ] @@ -523,8 +517,14 @@ } ], "source": [ - "tx_od_tract = get_lodes(state = \"TX\", year = 2022, lodes_type = \"od\", cache = True,\n", - " return_geometry = True, agg_level = \"tract\")\n", + "tx_od_tract = get_lodes(\n", + " state=\"TX\",\n", + " year=2022,\n", + " lodes_type=\"od\",\n", + " cache=True,\n", + " return_geometry=True,\n", + " agg_level=\"tract\",\n", + ")\n", "\n", "tx_od_tract.head()" ] @@ -865,9 +865,12 @@ "source": [ "tcu_ids = [\"48439104203\", \"48439104301\"]\n", "\n", - "tcu_origins = tx_od_tract.loc[(tx_od_tract['w_geocode'].isin(tcu_ids)) & 
(tx_od_tract['h_geocode'].str.slice(stop = 5) == \"48439\")]\n", + "tcu_origins = tx_od_tract.loc[\n", + " (tx_od_tract[\"w_geocode\"].isin(tcu_ids))\n", + " & (tx_od_tract[\"h_geocode\"].str.slice(stop=5) == \"48439\")\n", + "]\n", "\n", - "tcu_origins.explore(column = \"S000\")" + "tcu_origins.explore(column=\"S000\")" ] }, { diff --git a/docs/04-geocoding.ipynb b/docs/04-geocoding.ipynb index 38da6a1..b3f40ca 100644 --- a/docs/04-geocoding.ipynb +++ b/docs/04-geocoding.ipynb @@ -80,7 +80,7 @@ "source": [ "from pygris.geocode import geocode\n", "\n", - "geocode(address = \"1600 Pennsylvania Ave NW, Washington DC\")" + "geocode(address=\"1600 Pennsylvania Ave NW, Washington DC\")" ] }, { @@ -282,7 +282,9 @@ } ], "source": [ - "geocode(address = \"1600 Pennsylvania Ave NW, Washington DC\", as_gdf = True).explore(marker_type = \"marker\")" + "geocode(address=\"1600 Pennsylvania Ave NW, Washington DC\", as_gdf=True).explore(\n", + " marker_type=\"marker\"\n", + ")" ] }, { @@ -349,7 +351,7 @@ "source": [ "from pygris.geocode import geolookup\n", "\n", - "geolookup(longitude = -98.90629, latitude= 32.75639)" + "geolookup(longitude=-98.90629, latitude=32.75639)" ] }, { @@ -437,14 +439,17 @@ ], "source": [ "import pandas as pd\n", + "\n", "from pygris.geocode import batch_geocode\n", "\n", "my_addresses = pd.DataFrame(\n", - " {\"building\": [\"Chrysler Building\", \"Empire State Building\", \"Flatiron Building\"],\n", - " \"address\": [\"405 Lexington Ave\", \"20 W 34th St\", \"175 5th Ave\"],\n", - " \"city\": \"New York\",\n", - " \"state\": \"New York\",\n", - " \"zip\": [\"10174\", \"10018\", \"10010\"]}\n", + " {\n", + " \"building\": [\"Chrysler Building\", \"Empire State Building\", \"Flatiron Building\"],\n", + " \"address\": [\"405 Lexington Ave\", \"20 W 34th St\", \"175 5th Ave\"],\n", + " \"city\": \"New York\",\n", + " \"state\": \"New York\",\n", + " \"zip\": [\"10174\", \"10018\", \"10010\"],\n", + " }\n", ")\n", "\n", "my_addresses" @@ -583,9 +588,15 @@ 
} ], "source": [ - "my_points = batch_geocode(my_addresses, id_column = \"building\",\n", - " address = \"address\", city = \"city\", state = \"state\",\n", - " zip = \"zip\", as_gdf = True)\n", + "my_points = batch_geocode(\n", + " my_addresses,\n", + " id_column=\"building\",\n", + " address=\"address\",\n", + " city=\"city\",\n", + " state=\"state\",\n", + " zip=\"zip\",\n", + " as_gdf=True,\n", + ")\n", "\n", "my_points" ] @@ -789,8 +800,7 @@ } ], "source": [ - "\n", - "my_points.explore(marker_type = \"marker\")" + "my_points.explore(marker_type=\"marker\")" ] }, { @@ -1019,8 +1029,9 @@ "source": [ "from pygris import tracts\n", "\n", - "capitol_tracts = tracts(state = \"TX\", cb = True,\n", - " subset_by = {\"1100 Congress Ave., Austin, TX 78701\": 5000})\n", + "capitol_tracts = tracts(\n", + " state=\"TX\", cb=True, subset_by={\"1100 Congress Ave., Austin, TX 78701\": 5000}\n", + ")\n", "\n", "capitol_tracts.explore()" ] @@ -1240,7 +1251,10 @@ "source": [ "import pandas as pd\n", "\n", - "union_tracts_list = [tracts(cb = True, state = x, subset_by = {\"2501 Seaport Dr, Chester, PA 19013\": 10000}) for x in ['DE', 'PA', 'NJ']]\n", + "union_tracts_list = [\n", + " tracts(cb=True, state=x, subset_by={\"2501 Seaport Dr, Chester, PA 19013\": 10000})\n", + " for x in [\"DE\", \"PA\", \"NJ\"]\n", + "]\n", "\n", "union_tracts = pd.concat(union_tracts_list)\n", "\n", diff --git a/docs/index.md b/docs/index.md index da51288..4ebbd33 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,8 +1,8 @@ # Introduction -__pygris__ is a Python package to help users access US Census Bureau TIGER/Line and cartographic boundary shapefiles and load them into Python as GeoDataFrames. The goal of the package is to make it simple to access US Census geographic data; data can be acquired with a single function for a given geography (e.g. `tracts()` for Census tracts) along with a few options. 
+__pygris__ is a Python package to help users access US Census Bureau TIGER/Line and cartographic boundary shapefiles and load them into Python as GeoDataFrames. The goal of the package is to make it simple to access US Census geographic data; data can be acquired with a single function for a given geography (e.g. `tracts()` for Census tracts) along with a few options. -The package is a general port of the [R __tigris__ package](https://github.com/walkerke/tigris) with a few modifications. The framework of and philosophy behind the __tigris__ package is covered in [Chapter 5 of my book, _Analyzing US Census Data: Methods, Maps and Models in R_](https://walker-data.com/census-r/census-geographic-data-and-applications-in-r.html). +The package is a general port of the [R __tigris__ package](https://github.com/walkerke/tigris) with a few modifications. The framework of and philosophy behind the __tigris__ package is covered in [Chapter 5 of my book, _Analyzing US Census Data: Methods, Maps and Models in R_](https://walker-data.com/census-r/census-geographic-data-and-applications-in-r.html). 
Install __pygris__ from PyPI with the following command: diff --git a/mkdocs.yml b/mkdocs.yml index 2fc53f3..d37a84f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -2,11 +2,11 @@ site_name: pygris site_url: https://walker-data.com/pygris repo_url: https://github.com/walkerke/pygris -theme: +theme: name: material - palette: + palette: primary: black - icon: + icon: repo: fontawesome/brands/github logo: img/tiger_map_small.png favicon: img/tiger_map_small.png diff --git a/pygris/__init__.py b/pygris/__init__.py index 8e59c2c..73c1b9d 100644 --- a/pygris/__init__.py +++ b/pygris/__init__.py @@ -1,10 +1,10 @@ __version__ = "0.2.0" -from .helpers import * from .enumeration_units import * -from .water import * -from .transportation import * -from .native import * +from .helpers import * from .legislative import * +from .metro_areas import * from .national import * -from .metro_areas import * \ No newline at end of file +from .native import * +from .transportation import * +from .water import * diff --git a/pygris/data.py b/pygris/data.py index 9116b16..ccca194 100644 --- a/pygris/data.py +++ b/pygris/data.py @@ -1,86 +1,92 @@ -import requests -import pandas as pd -import numpy as np -import appdirs import os -from pygris.enumeration_units import states, counties, tracts, block_groups, blocks -from pygris.geometry import _get_geometry import warnings +import appdirs +import numpy as np +import pandas as pd +import requests + +from pygris.geometry import _get_geometry + -def get_census(dataset, variables, year = None, params = {}, - return_geoid = False, guess_dtypes = False): +def get_census( + dataset, + variables, + year=None, + params=None, + return_geoid=False, + guess_dtypes=False, +): """ Make a request to a US Census Bureau API endpoint Parameters - -------------- + ---------- dataset : str The dataset name; browse https://api.census.gov/data.html for options. - The name will be the componet of the API URL that follows "data/" or - the year. 
For example, 1-year ACS data will be "acs/acs1". + The name will be the component of the API URL that follows "data/" or + the year. For example, 1-year ACS data will be "acs/acs1". variables : str or list - A string (or list of strings) representing the variables requested from the - API. Datasets have 'variables.html' pages that can be viewed to find - variable IDs, e.g. "https://api.census.gov/data/2017/acs/acs1/variables.html". + A string (or list of strings) representing the variables requested from the + API. Datasets have 'variables.html' pages that can be viewed to find + variable IDs, e.g. "https://api.census.gov/data/2017/acs/acs1/variables.html". year : int - The year of the dataset, e.g. 2021. Not all datasets use a year, so leave - blank if so (such as the timeseries APIs). - params : dict - A dict of parameters to send with your API request. This will + The year of the dataset, e.g. 2021. Not all datasets use a year, so leave + blank if the dataset has none (such as the timeseries APIs). + params : dict, optional + A dict of parameters to send with your API request. This will vary based on the API you are using. You don't need to include - variables in the request, but other optional parameters - will be included here. + variables in the request, but other optional parameters + will be included here. return_geoid : bool - If True, `get_census()` will attempt to assemble a GEOID column - from contextual information in the dataset that is suitable for + If True, `get_census()` will attempt to assemble a GEOID column + from contextual information in the dataset that is suitable for merging to Census shapes acquired with pygris. This won't make sense - / won't work for all datasets, so use this option with caution. - Defaults to False. + / won't work for all datasets, so use this option with caution. + Defaults to False. guess_dtypes : bool - The Census APIs return all columns as strings, but many data - columns should be treated as numbers. 
If True, `get_census()` + The Census APIs return all columns as strings, but many data + columns should be treated as numbers. If True, `get_census()` will scan the columns and try to guess which columns should be converted to numeric and do so. Users may want to leave this - option False (the default) and convert columns on a - case-by-case basis. + option False (the default) and convert columns on a + case-by-case basis. Returns - ------------- + ------- A Pandas DataFrame of data from the requested US Census dataset. Notes - ------------- - This function is a low-level interface to the Census APIs provided for convenience. For a full-featured, Pythonic - interface to the US Census Bureau APIs, I would recommend using the cenpy package (https://cenpy-devs.github.io/cenpy/index.html) + ----- + This function is a low-level interface to the Census APIs provided for convenience. + For a full-featured, Pythonic interface to the US Census Bureau APIs, I would + recommend using the cenpy package (https://cenpy-devs.github.io/cenpy/index.html). - `get_census()` is inspired by Hannah Recht's work on the censusapi R package (https://www.hrecht.com/censusapi/). + `get_census()` is inspired by Hannah Recht's work on the censusapi R package (https://www.hrecht.com/censusapi/). 
""" - + params = params or {} endpoint = "https://api.census.gov/data" if type(variables) is not list: variables = [variables] - if year is None: - base = f"{endpoint}/{dataset}" - else: - base = f"{endpoint}/{year}/{dataset}" + base = f"{endpoint}/{dataset}" if year is None else f"{endpoint}/{year}/{dataset}" # get request must be <50, split it and run each chunk (adapted from cenpy) data = [] n_chunks = np.ceil(len(variables) / 50) for chunk in np.array_split(variables, n_chunks): - joined_vars = ",".join(chunk) - params.update({'get': joined_vars}) + params.update({"get": joined_vars}) - req = requests.get(url = base, params = params) + req = requests.get(url=base, params=params) if req.status_code != 200: - raise SyntaxError(f"Request failed. The Census Bureau error message is {req.text}") + raise SyntaxError( + f"Request failed. The Census Bureau error message is {req.text}" + ) df = pd.DataFrame(req.json()[1:], columns=req.json()[0]) @@ -88,10 +94,13 @@ def get_census(dataset, variables, year = None, params = {}, # find the columns that are not in variables my_cols = list(df.columns) - # if 'state' is not in the list of columns, don't assemble the GEOID; too much - # ambiguity among possible combinations across the various endpoints + # if 'state' is not in the list of columns, don't assemble the GEOID; + # too much ambiguity among possible combinations across the various + # endpoints if "state" not in my_cols: - raise ValueError("`return_geoid` is not supported for this geography hierarchy.") + raise ValueError( + "`return_geoid` is not supported for this geography hierarchy." 
+ ) # Identify the position of the state column in my_cols, and # extract all the columns that follow it @@ -100,23 +109,25 @@ def get_census(dataset, variables, year = None, params = {}, geoid_cols = my_cols[state_ix:] # Assemble the GEOID column, then remove its constituent parts - df['GEOID'] = df[geoid_cols].agg("".join, axis = 1) + df["GEOID"] = df[geoid_cols].agg("".join, axis=1) - df.drop(geoid_cols, axis = 1, inplace = True) + df = df.drop(geoid_cols, axis=1) if guess_dtypes: num_list = [] - # Iterate through the columns in variables and try to guess if they should be converted + # Iterate through the columns in variables and try to guess if they should + # be converted for v in chunk: - check = pd.to_numeric(df[v], errors = "coerce") - # If the columns aren't fully null, convert to numeric, taking care of any oddities - if not pd.isnull(check.unique())[0]: + check = pd.to_numeric(df[v], errors="coerce") + # If the columns aren't fully null, convert to numeric, taking care + # of any oddities + if not pd.isna(check.unique())[0]: df[v] = check num_list.append(v) - # If we are guessing numerics, we should convert NAs (negatives below -1 million) - # to NaN. Users who want to keep the codes should keep as object and handle - # themselves. + # If we are guessing numerics, we should convert NAs + # (negatives below -1 million) to NaN. Users who want to keep the codes + # should keep as object and handle themselves. 
df[num_list] = df[num_list].where(df[num_list] > -999999) data += [df] # Add output from each chunk to list @@ -132,118 +143,132 @@ def get_census(dataset, variables, year = None, params = {}, return out -def get_lodes(state, year, version = "LODES8", lodes_type = "od", part = "main", - job_type = "JT00", segment = "S000", agg_level = "block", cache = False, - return_geometry = False, return_lonlat = False, od_geometry = "home", - cb = True): - +def get_lodes( + state, + year, + version="LODES8", + lodes_type="od", + part="main", + job_type="JT00", + segment="S000", + agg_level="block", + cache=False, + return_geometry=False, + return_lonlat=False, + od_geometry="home", + cb=True, +): """ - Get synthetic block-level data on workplace, residence, and origin-destination flows characteristics from the - LEHD Origin-Destination Employment Statistics (LODES) dataset + Get synthetic block-level data on workplace, residence, and origin-destination + flows characteristics from the LEHD Origin-Destination Employment Statistics + (LODES) dataset Parameters - -------------- + ---------- state : str - The state postal code of your requested data. Please note that not all states are available - in all years. + The state postal code of your requested data. Please note that not all states + are available in all years. year : int - The year of your requested data. LODES data go back to 2002, but not all datasets are available - for all years / for all states. + The year of your requested data. LODES data go back to 2002, but not all + datasets are available for all years / for all states. version : str - The LODES version to use. Version 8 (the default, use "LODES8") is enumerated at 2020 Census blocks. - "LODES7" is enumerated at 2010 Census blocks, but ends in 2019; "LODES5" is enumerated at 2000 Census - blocks, but ends in 2009. + The LODES version to use. Version 8 (the default, use "LODES8") is enumerated + at 2020 Census blocks. 
"LODES7" is enumerated at 2010 Census blocks, but ends + in 2019; "LODES5" is enumerated at 2000 Census blocks, but ends in 2009. lodes_type : str - One of "od" (the default) for origin-destination flows, "wac" for workplace area characteristics, - or "rac" for residence area characteristics. + One of "od" (the default) for origin-destination flows, "wac" for workplace + area characteristics, or "rac" for residence area characteristics. part : str - Only relevant for the "od" file. "main" gives information on within-state residence to workplace flows. - "aux" gives information for residence to workplace flows from outside a given state. + Only relevant for the "od" file. "main" gives information on within-state + residence to workplace flows. "aux" gives information for residence to + workplace flows from outside a given state. job_type : str - The available job type breakdown; defaults to "JT00" for all jobs. Please review the LODES technical - documentation for a description of other options. - segment : str - The workforce segment, relevant when lodes_type is "wac" or "rac". Defaults to "S000" for total jobs; + The available job type breakdown; defaults to "JT00" for all jobs. Please review the LODES technical documentation for a description of other options. + segment : str + The workforce segment, relevant when lodes_type is "wac" or "rac". Defaults + to "S000" for total jobs; review the LODES technical documentation for a + description of other options. agg_level : str - The level at which to aggregate the data. Defaults to the Census block; other options include - "county", "tract", and "block group". + The level at which to aggregate the data. Defaults to the Census block; other + options include "county", "tract", and "block group". cache : bool - If True, downloads the requested LODES data to a cache directory on your computer and reads from - that directory if the file exists. Defaults to False, which will download the data by default. 
+ If True, downloads the requested LODES data to a cache directory on your + computer and reads from that directory if the file exists. Defaults to False, + which will download the data by default. return_geometry : bool - If True, get_lodes() will fetch the corresponding polygon geometry for shapes and return a GeoPandas - GeoDataFrame. Defaults to False. + If True, get_lodes() will fetch the corresponding polygon geometry for shapes + and return a GeoPandas GeoDataFrame. Defaults to False. return_lonlat : bool - If True, columns representing the corresponding polygon centroid will be + If True, columns representing the corresponding polygon centroid will be + included in the output. Cannot be used at the same time as return_geometry. + Defaults to False. od_geometry : str - Whether to attach residential geometries ("home") or workplace geometries ("work"). Only specified - when lodes_type is "od". Defaults to "home". + Whether to attach residential geometries ("home") or workplace geometries + ("work"). Only specified when lodes_type is "od". Defaults to "home". cb : bool - If retrieving geometry, use the Cartographic Boundary shapefile (True) or the TIGER/Line shapefile (False). - Defaults to True for LODES8 and LODES7, and False for LODES5. + If retrieving geometry, use the Cartographic Boundary shapefile (True) or the + TIGER/Line shapefile (False). Defaults to True for LODES8 and LODES7, and + False for LODES5. Returns - --------------- + ------- A Pandas DataFrame or GeoPandas GeoDataFrame of LODES data. Notes - --------------- - Please review the LODES technical documentation at https://lehd.ces.census.gov/data/lodes/LODES8/LODESTechDoc8.0.pdf for - more information. + ----- + Please review the LODES technical documentation at https://lehd.ces.census.gov/data/lodes/LODES8/LODESTechDoc8.0.pdf + for more information. - `get_lodes()` is inspired by the lehdr R package (https://github.com/jamgreen/lehdr) by - Jamaal Green, Dillon Mahmoudi, and Liming Wang. 
+ `get_lodes()` is inspired by the lehdr R package (https://github.com/jamgreen/lehdr) + by Jamaal Green, Dillon Mahmoudi, and Liming Wang. - - """ - if lodes_type not in ['od', 'wac', 'rac']: + """ + if lodes_type not in ["od", "wac", "rac"]: raise ValueError("lodes_type must be one of 'od', 'rac', or 'wac'.") - + state = state.lower() if lodes_type == "od": url = f"https://lehd.ces.census.gov/data/lodes/{version}/{state}/od/{state}_od_{part}_{job_type}_{year}.csv.gz" else: url = f"https://lehd.ces.census.gov/data/lodes/{version}/{state}/{lodes_type}/{state}_{lodes_type}_{segment}_{job_type}_{year}.csv.gz" - + if not cache: lodes_data = pd.read_csv(url) - + else: cache_dir = appdirs.user_cache_dir("pygris") if not os.path.isdir(cache_dir): - os.mkdir(cache_dir) + os.mkdir(cache_dir) basename = os.path.basename(url) out_file = os.path.join(cache_dir, basename) - + # If the file doesn't exist, you'll need to download it # and write it to the cache directory if not os.path.isfile(out_file): - req = requests.get(url = url) + req = requests.get(url=url) - with open(out_file, 'wb') as fd: + with open(out_file, "wb") as fd: fd.write(req.content) - + # Now, read in the file from the cache directory lodes_data = pd.read_csv(out_file) # Drop the 'createdate' column - lodes_data = lodes_data.drop('createdate', axis = 1) + lodes_data = lodes_data.drop("createdate", axis=1) if lodes_type == "od": - lodes_data['w_geocode'] = lodes_data['w_geocode'].astype(str).str.zfill(15) - lodes_data['h_geocode'] = lodes_data['h_geocode'].astype(str).str.zfill(15) + lodes_data["w_geocode"] = lodes_data["w_geocode"].astype(str).str.zfill(15) + lodes_data["h_geocode"] = lodes_data["h_geocode"].astype(str).str.zfill(15) elif lodes_type == "rac": - lodes_data['h_geocode'] = lodes_data['h_geocode'].astype(str).str.zfill(15) + lodes_data["h_geocode"] = lodes_data["h_geocode"].astype(str).str.zfill(15) else: - lodes_data['w_geocode'] = lodes_data['w_geocode'].astype(str).str.zfill(15) + 
lodes_data["w_geocode"] = lodes_data["w_geocode"].astype(str).str.zfill(15) # Handle aggregation logic if agg_level != "block": @@ -253,228 +278,253 @@ def get_lodes(state, year, version = "LODES8", lodes_type = "od", part = "main", end = 11 elif agg_level == "block group": end = 12 - else: - raise ValueError("Invalid agg_level; choose one of 'state', 'county', 'tract', or 'block group'.") - + else: + raise ValueError( + "Invalid agg_level; choose one of 'state', 'county', 'tract', " + "or 'block group'." + ) + if lodes_type == "wac": - lodes_data['w_geocode'] = lodes_data['w_geocode'].str.slice(stop = end) + lodes_data["w_geocode"] = lodes_data["w_geocode"].str.slice(stop=end) - lodes_data = lodes_data.groupby('w_geocode').agg("sum") + lodes_data = lodes_data.groupby("w_geocode").agg("sum") elif lodes_type == "rac": - lodes_data['h_geocode'] = lodes_data['h_geocode'].str.slice(stop = end) + lodes_data["h_geocode"] = lodes_data["h_geocode"].str.slice(stop=end) - lodes_data = lodes_data.groupby('h_geocode').agg("sum") + lodes_data = lodes_data.groupby("h_geocode").agg("sum") elif lodes_type == "od": - lodes_data['h_geocode'] = lodes_data['h_geocode'].str.slice(stop = end) - lodes_data['w_geocode'] = lodes_data['w_geocode'].str.slice(stop = end) + lodes_data["h_geocode"] = lodes_data["h_geocode"].str.slice(stop=end) + lodes_data["w_geocode"] = lodes_data["w_geocode"].str.slice(stop=end) - lodes_data = lodes_data.groupby(['h_geocode', 'w_geocode']).agg("sum") + lodes_data = lodes_data.groupby(["h_geocode", "w_geocode"]).agg("sum") lodes_data = lodes_data.reset_index() # Handle geometry requests if return_geometry: - print("Requesting feature geometry.") + print("Requesting feature geometry.") if not cache: ("Use cache = True to speed this up in the future.") - if return_lonlat: - raise ValueError("return_geometry and return_lonlat cannot be used at the same time.") + if return_lonlat: + raise ValueError( + "return_geometry and return_lonlat cannot be used at the same 
time." + ) if version == "LODES8": year = 2020 - elif version == "LODES7": + elif version == "LODES7": year = 2019 else: year = 2000 cb = False - - if lodes_type == "wac": - geom = _get_geometry(geography = agg_level, state = state, cb = cb, year = year, cache = cache) + geom = _get_geometry( + geography=agg_level, state=state, cb=cb, year=year, cache=cache + ) - geom.columns = ['w_geocode', 'geometry'] + geom.columns = ["w_geocode", "geometry"] - geom_merged = geom.merge(lodes_data, on = "w_geocode") + geom_merged = geom.merge(lodes_data, on="w_geocode") elif lodes_type == "rac": - geom = _get_geometry(geography = agg_level, state = state, cb = cb, year = year, cache = cache) + geom = _get_geometry( + geography=agg_level, state=state, cb=cb, year=year, cache=cache + ) + + geom.columns = ["h_geocode", "geometry"] - geom.columns = ['h_geocode', 'geometry'] + geom_merged = geom.merge(lodes_data, on="h_geocode") - geom_merged = geom.merge(lodes_data, on = "h_geocode") - elif lodes_type == "od": if od_geometry == "home": if part == "main": - geom = _get_geometry(geography = agg_level, state = state, cb = cb, year = year, cache = cache) - geom.columns = ['h_geocode', 'geometry'] - - geom_merged = geom.merge(lodes_data, on = "h_geocode") - else: - aux_states = lodes_data['h_geocode'].str.slice(stop = 2).unique().tolist() - h_geom_list = [_get_geometry(geography = agg_level, state = x, year = year, cb = cb, cache = cache) for x in aux_states] + geom = _get_geometry( + geography=agg_level, state=state, cb=cb, year=year, cache=cache + ) + geom.columns = ["h_geocode", "geometry"] + + geom_merged = geom.merge(lodes_data, on="h_geocode") + else: + aux_states = ( + lodes_data["h_geocode"].str.slice(stop=2).unique().tolist() + ) + h_geom_list = [ + _get_geometry( + geography=agg_level, state=x, year=year, cb=cb, cache=cache + ) + for x in aux_states + ] h_geom = pd.concat(h_geom_list) - h_geom.columns = ['h_geocode', 'geometry'] + h_geom.columns = ["h_geocode", "geometry"] - 
geom_merged = h_geom.merge(lodes_data, on = "h_geocode") + geom_merged = h_geom.merge(lodes_data, on="h_geocode") elif od_geometry == "work": - geom = _get_geometry(geography = agg_level, state = state, cb = cb, year = year, cache = cache) + geom = _get_geometry( + geography=agg_level, state=state, cb=cb, year=year, cache=cache + ) - geom.columns = ['w_geocode', 'geometry'] + geom.columns = ["w_geocode", "geometry"] - geom_merged = geom.merge(lodes_data, on = "w_geocode") - else: + geom_merged = geom.merge(lodes_data, on="w_geocode") + else: raise ValueError("od_geometry must be one of 'home' or 'work'.") - + return geom_merged - + elif return_lonlat: - warnings.filterwarnings('ignore') - print("Requesting feature geometry to determine longitude and latitude.") + warnings.filterwarnings("ignore") + print("Requesting feature geometry to determine longitude and latitude.") if not cache: ("Use cache = True to speed this up in the future.") if version == "LODES8": year = 2020 - elif version == "LODES7": + elif version == "LODES7": year = 2019 else: year = 2000 cb = False - geom = _get_geometry(geography = agg_level, state = state, cb = cb, year = year, cache = cache) + geom = _get_geometry( + geography=agg_level, state=state, cb=cb, year=year, cache=cache + ) if lodes_type == "wac": - geom.columns = ['w_geocode', 'geometry'] + geom.columns = ["w_geocode", "geometry"] with warnings.catch_warnings(): - geom['w_lon'] = geom.centroid.x - geom['w_lat'] = geom.centroid.y + geom["w_lon"] = geom.centroid.x + geom["w_lat"] = geom.centroid.y + + xy = geom.drop("geometry", axis=1) - xy = geom.drop('geometry', axis = 1) + lodes_merged = lodes_data.merge(xy, on="w_geocode") - lodes_merged = lodes_data.merge(xy, on = 'w_geocode') - elif lodes_type == "rac": - geom.columns = ['h_geocode', 'geometry'] + geom.columns = ["h_geocode", "geometry"] with warnings.catch_warnings(): - geom['h_lon'] = geom.centroid.x - geom['h_lat'] = geom.centroid.y + geom["h_lon"] = geom.centroid.x + 
geom["h_lat"] = geom.centroid.y + + xy = geom.drop("geometry", axis=1) - xy = geom.drop('geometry', axis = 1) + lodes_merged = lodes_data.merge(xy, on="h_geocode") - lodes_merged = lodes_data.merge(xy, on = 'h_geocode') - elif lodes_type == "od": w_geom = geom.copy() - w_geom.columns = ['w_geocode', 'geometry'] + w_geom.columns = ["w_geocode", "geometry"] with warnings.catch_warnings(): - w_geom['w_lon'] = w_geom.centroid.x - w_geom['w_lat'] = w_geom.centroid.y + w_geom["w_lon"] = w_geom.centroid.x + w_geom["w_lat"] = w_geom.centroid.y - w_xy = w_geom.drop('geometry', axis = 1) + w_xy = w_geom.drop("geometry", axis=1) if part == "main": h_geom = geom.copy() elif part == "aux": - aux_states = lodes_data['h_geocode'].str.slice(stop = 2).unique().tolist() - h_geom_list = [_get_geometry(geography = agg_level, state = x, year = year, cb = cb, cache = cache) for x in aux_states] + aux_states = lodes_data["h_geocode"].str.slice(stop=2).unique().tolist() + h_geom_list = [ + _get_geometry( + geography=agg_level, state=x, year=year, cb=cb, cache=cache + ) + for x in aux_states + ] h_geom = pd.concat(h_geom_list) - h_geom.columns = ['h_geocode', 'geometry'] + h_geom.columns = ["h_geocode", "geometry"] with warnings.catch_warnings(): - h_geom['h_lon'] = h_geom.centroid.x - h_geom['h_lat'] = h_geom.centroid.y + h_geom["h_lon"] = h_geom.centroid.x + h_geom["h_lat"] = h_geom.centroid.y - h_xy = h_geom.drop('geometry', axis = 1) + h_xy = h_geom.drop("geometry", axis=1) + + lodes_merged = lodes_data.merge(h_xy, on="h_geocode").merge( + w_xy, on="w_geocode" + ) - lodes_merged = (lodes_data - .merge(h_xy, on = "h_geocode") - .merge(w_xy, on = "w_geocode") - ) - return lodes_merged else: - return lodes_data - + return lodes_data -def get_xwalk(state, version = "LODES8", cache = False): +def get_xwalk(state, version="LODES8", cache=False): """ - Get a Census block-to-parent geography crosswalk file for a given state and a given Census year (represented) - by a LODES version). 
+ Get a Census block-to-parent geography crosswalk file for a given state and a + given Census year (represented) by a LODES version. Parameters - -------------- + ---------- state : str - The state postal code of your requested data. + The state postal code of your requested data. version : str - The LODES version to use. Version 8 (the default, use "LODES8") is enumerated at 2020 Census blocks. Version 7 - (use "LODES7") is enumerated at 2010 Census blocks. + The LODES version to use. Version 8 (the default, use "LODES8") is enumerated + at 2020 Census blocks. Version 7 (use "LODES7") is enumerated at 2010 Census + blocks. cache : bool - If True, downloads the requested LODES data to a cache directory on your computer and reads from - that directory if the file exists. Defaults to False, which will download the data by default. + If True, downloads the requested LODES data to a cache directory on your + computer and reads from that directory if the file exists. Defaults to False, + which will download the data by default. Returns - --------------- - A Pandas DataFrame representing the correspondence between Census blocks and a variety of parent geograpies - in a given LODES dataset (and in turn a given Census year). + ------- + A Pandas DataFrame representing the correspondence between Census blocks and a + variety of parent geograpies in a given LODES dataset (and in turn a given Census + year). Notes - --------------- - Please review the LODES technical documentation at https://lehd.ces.census.gov/data/lodes/LODES8/LODESTechDoc8.0.pdf for - more information. + ----- + Please review the LODES technical documentation at https://lehd.ces.census.gov/data/lodes/LODES8/LODESTechDoc8.0.pdf + for more information. - `get_xwalk()` is inspired by the lehdr R package (https://github.com/jamgreen/lehdr) by - Jamaal Green, Dillon Mahmoudi, and Liming Wang. 
+ `get_xwalk()` is inspired by the lehdr R package (https://github.com/jamgreen/lehdr) + by Jamaal Green, Dillon Mahmoudi, and Liming Wang. - - """ + """ state = state.lower() - url = f"https://lehd.ces.census.gov/data/lodes/{version}/{state}/{state}_xwalk.csv.gz" + url = ( + f"https://lehd.ces.census.gov/data/lodes/{version}/{state}/{state}_xwalk.csv.gz" + ) if not cache: xwalk_data = pd.read_csv(url, dtype="object") - + else: cache_dir = appdirs.user_cache_dir("pygris") if not os.path.isdir(cache_dir): - os.mkdir(cache_dir) + os.mkdir(cache_dir) basename = os.path.basename(url) out_file = os.path.join(cache_dir, basename) - + # If the file doesn't exist, you'll need to download it # and write it to the cache directory if not os.path.isfile(out_file): - req = requests.get(url = url) + req = requests.get(url=url) - with open(out_file, 'wb') as fd: + with open(out_file, "wb") as fd: fd.write(req.content) - + # Now, read in the file from the cache directory - xwalk_data = pd.read_csv(out_file, dtype = "object") - - xwalk_data = xwalk_data.drop('createdate', axis = 1) + xwalk_data = pd.read_csv(out_file, dtype="object") - xwalk_data['blklatdd'] = xwalk_data['blklatdd'].astype(float) - xwalk_data['blklondd'] = xwalk_data['blklondd'].astype(float) - - return xwalk_data + xwalk_data = xwalk_data.drop("createdate", axis=1) + xwalk_data["blklatdd"] = xwalk_data["blklatdd"].astype(float) + xwalk_data["blklondd"] = xwalk_data["blklondd"].astype(float) + return xwalk_data diff --git a/pygris/enumeration_units.py b/pygris/enumeration_units.py index 0a8fb53..c0a0ee8 100644 --- a/pygris/enumeration_units.py +++ b/pygris/enumeration_units.py @@ -2,68 +2,82 @@ __author__ = "Kyle Walker 2018 and cb is True: - state = 'us' + state = "us" print("Retrieving Census tracts for the entire United States") else: raise ValueError("A state is required for this year/dataset combination.") else: state = validate_state(state) - + if cb is True: if year in [1990, 2000]: suf = str(year)[2:] @@ 
-184,66 +211,78 @@ def tracts(state = None, county = None, cb = False, year = None, cache = False, else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/TRACT/tl_{year}_{state}_tract.zip" - trcts = _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) + trcts = _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) if county is not None: if type(county) is not list: county = [county] - valid_county = [validate_county(state, x) for x in county] - trcts = trcts.query('COUNTYFP in @valid_county') + valid_county = [validate_county(state, x) for x in county] # noqa: F841 + trcts = trcts.query("COUNTYFP in @valid_county") return trcts -def block_groups(state = None, county = None, cb = False, year = None, cache = False, subset_by = None, protocol = "http", timeout = 1800): +def block_groups( + state=None, + county=None, + cb=False, + year=None, + cache=False, + subset_by=None, + protocol="http", + timeout=1800, +): """ Load a Census block groups shapefile into Python as a GeoDataFrame Parameters ---------- - state : str - The state name, state abbreviation, or two-digit FIPS code of the desired state. - If None, block groups for the entire United States will be downloaded if - available for that year / dataset combination. + state : str + The state name, state abbreviation, or two-digit FIPS code of the desired state. + If None, block groups for the entire United States will be downloaded if + available for that year / dataset combination. county : str - The county name or three-digit FIPS code of the desired county. If None, block groups - for the selected state will be downloaded. - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. + The county name or three-digit FIPS code of the desired county. If None, block + groups for the selected state will be downloaded. 
+ cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. Defaults to False (the regular TIGER/Line file). - year : int - The year of the TIGER/Line or cartographic boundary shapefile. - cache : bool - If True, the function will download a Census shapefile to a cache directory + year : int + The year of the TIGER/Line or cartographic boundary shapefile. + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. subset_by : tuple, int, slice, dict, geopandas.GeoDataFrame, or geopandas.GeoSeries - An optional directive telling pygris to return a subset of data using - underlying arguments in geopandas.read_file(). + An optional directive telling pygris to return a subset of data using + underlying arguments in geopandas.read_file(). subset_by operates as follows: - * If a user supplies a tuple of format (minx, miny, maxx, maxy), + * If a user supplies a tuple of format (minx, miny, maxx, maxy), it will be interpreted as a bounding box and rows will be returned that intersect that bounding box; * If a user supplies a integer or a slice object, the first n rows (or the rows defined by the slice object) will be returned; * If a user supplies an object of type geopandas.GeoDataFrame - or of type geopandas.GeoSeries, rows that intersect the input - object will be returned. CRS misalignment will be resolved - internally. + or of type geopandas.GeoSeries, rows that intersect the input + object will be returned. CRS misalignment will be resolved + internally. * A dict of format {"address": "buffer_distance"} will return rows - that intersect a buffer of a given distance (in meters) around an - input address. - + that intersect a buffer of a given distance (in meters) around an + input address. 
+ Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of Census block groups. Notes - ---------- - See https://www2.census.gov/geo/pdfs/reference/GARM/Ch10GARM.pdf for more information. - - + ----- + See https://www2.census.gov/geo/pdfs/reference/GARM/Ch10GARM.pdf for more + information. + + """ if year is None: print("Using the default year of 2024") @@ -251,13 +290,13 @@ def block_groups(state = None, county = None, cb = False, year = None, cache = F if state is None: if year > 2018 and cb is True: - state = 'us' + state = "us" print("Retrieving Census block groups for the entire United States") else: raise ValueError("A state is required for this year/dataset combination.") else: state = validate_state(state) - + if cb is True: if year in [1990, 2000]: suf = str(year)[2:] @@ -277,63 +316,75 @@ def block_groups(state = None, county = None, cb = False, year = None, cache = F else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/BG/tl_{year}_{state}_bg.zip" - bgs = _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) + bgs = _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) if county is not None: if type(county) is not list: county = [county] - valid_county = [validate_county(state, x) for x in county] - bgs = bgs.query('COUNTYFP in @valid_county') + valid_county = [validate_county(state, x) for x in county] # noqa: F841 + bgs = bgs.query("COUNTYFP in @valid_county") return bgs -def school_districts(state = None, type = "unified", cb = False, year = None, cache = False, subset_by = None, protocol = "http", timeout = 1800): +def school_districts( + state=None, + type="unified", + cb=False, + year=None, + cache=False, + subset_by=None, + protocol="http", + timeout=1800, +): """ Load a school districts shapefile into Python as a GeoDataFrame Parameters ---------- - state : str - The state name, state abbreviation, or two-digit FIPS code of the desired state. 
- If None, school districts for the entire United States will be downloaded - if available for that year / dataset combination. - type : str - One of "unified", "elementary", or "secondary". - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. + state : str + The state name, state abbreviation, or two-digit FIPS code of the desired state. + If None, school districts for the entire United States will be downloaded + if available for that year / dataset combination. + type : str + One of "unified", "elementary", or "secondary". + cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. Defaults to False (the regular TIGER/Line file). - year : int - The year of the TIGER/Line or cartographic boundary shapefile. - cache : bool - If True, the function will download a Census shapefile to a cache directory + year : int + The year of the TIGER/Line or cartographic boundary shapefile. + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. subset_by : tuple, int, slice, dict, geopandas.GeoDataFrame, or geopandas.GeoSeries - An optional directive telling pygris to return a subset of data using - underlying arguments in geopandas.read_file(). + An optional directive telling pygris to return a subset of data using + underlying arguments in geopandas.read_file(). 
subset_by operates as follows: - * If a user supplies a tuple of format (minx, miny, maxx, maxy), + * If a user supplies a tuple of format (minx, miny, maxx, maxy), it will be interpreted as a bounding box and rows will be returned that intersect that bounding box; * If a user supplies a integer or a slice object, the first n rows (or the rows defined by the slice object) will be returned; * If a user supplies an object of type geopandas.GeoDataFrame - or of type geopandas.GeoSeries, rows that intersect the input - object will be returned. CRS misalignment will be resolved - internally. + or of type geopandas.GeoSeries, rows that intersect the input + object will be returned. CRS misalignment will be resolved + internally. * A dict of format {"address": "buffer_distance"} will return rows - that intersect a buffer of a given distance (in meters) around an - input address. + that intersect a buffer of a given distance (in meters) around an + input address. Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of school district boundaries. Notes - ---------- - See https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2020/TGRSHP2020_TechDoc.pdf for more information. + ----- + See https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2020/TGRSHP2020_TechDoc.pdf + for more information. """ @@ -346,10 +397,12 @@ def school_districts(state = None, type = "unified", cb = False, year = None, ca state = "us" print("Retrieving school districts for the entire United States") else: - raise ValueError("A state must be specified for this year/dataset combination.") + raise ValueError( + "A state must be specified for this year/dataset combination." 
+ ) else: state = validate_state(state) - + if type == "unified": type = "unsd" elif type == "elementary": @@ -357,57 +410,65 @@ def school_districts(state = None, type = "unified", cb = False, year = None, ca elif type == "secondary": type = "scsd" else: - raise ValueError("Invalid school district type.\nValid types are 'unified', 'elementary', and 'secondary'.") + raise ValueError( + "Invalid school district type.\nValid types are 'unified', 'elementary', " + "and 'secondary'." + ) if cb is True: url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_{state}_{type}_500k.zip" else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/{type.upper()}/tl_{year}_{state}_{type}.zip" - - return _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) + return _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) -def states(cb = True, resolution = "500k", year = None, cache = False, protocol = "http", timeout = 1800): +def states( + cb=True, resolution="500k", year=None, cache=False, protocol="http", timeout=1800 +): """ Load a states shapefile into Python as a GeoDataFrame Parameters ---------- - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. + cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. Defaults to False (the regular TIGER/Line file). - resolution : str - The resolution of the cartographic boundary file; only applies if + resolution : str + The resolution of the cartographic boundary file; only applies if the cb argument is set to True. The default is "500k"; options also - include "5m" (1:5 million) and "20m" (1:20 million) - year : int + include "5m" (1:5 million) and "20m" (1:20 million) + year : int The year of the TIGER/Line or cartographic boundary shapefile. If not specified, defaults to 2024. 
- cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of states. Notes - ---------- - See https://www2.census.gov/geo/pdfs/reference/GARM/Ch4GARM.pdf for more information. + ----- + See https://www2.census.gov/geo/pdfs/reference/GARM/Ch4GARM.pdf for more + information. """ - if resolution not in ["500k", "5m", "20m"]: - raise ValueError("Invalid value for resolution. Valid values are '500k', '5m', and '20m'.") - + raise ValueError( + "Invalid value for resolution. Valid values are '500k', '5m', and '20m'." + ) + if year is None: print("Using the default year of 2024") year = 2024 - + if cb: if year in [1990, 2000]: suf = str(year)[2:] @@ -427,65 +488,74 @@ def states(cb = True, resolution = "500k", year = None, cache = False, protocol url = f"https://www2.census.gov/geo/tiger/TIGER2010/STATE/{year}/tl_2010_us_state{suf}.zip" else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/STATE/tl_{year}_us_state.zip" - - return _load_tiger(url, cache = cache, protocol = protocol, timeout = timeout) -def pumas(state = None, cb = False, year = None, cache = False, subset_by = None, protocol = "http", timeout = 1800): + return _load_tiger(url, cache=cache, protocol=protocol, timeout=timeout) + + +def pumas( + state=None, + cb=False, + year=None, + cache=False, + subset_by=None, + protocol="http", + timeout=1800, +): """ Load a public use microdata area (PUMA) shapefile into Python as a GeoDataFrame Parameters ---------- state : str - The state name, state abbreviation, or two-digit FIPS code of the desired state. + The state name, state abbreviation, or two-digit FIPS code of the desired state. 
If None, PUMAs for the entire United States - will be downloaded if available for that dataset / year combination. - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. - Defaults to False (the regular TIGER/Line file). - year : int + will be downloaded if available for that dataset / year combination. + cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. + Defaults to False (the regular TIGER/Line file). + year : int The year of the TIGER/Line or cartographic boundary shapefile. If not specified, defaults to 2024. - cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. subset_by : tuple, int, slice, dict, geopandas.GeoDataFrame, or geopandas.GeoSeries - An optional directive telling pygris to return a subset of data using - underlying arguments in geopandas.read_file(). + An optional directive telling pygris to return a subset of data using + underlying arguments in geopandas.read_file(). subset_by operates as follows: - * If a user supplies a tuple of format (minx, miny, maxx, maxy), + * If a user supplies a tuple of format (minx, miny, maxx, maxy), it will be interpreted as a bounding box and rows will be returned that intersect that bounding box; * If a user supplies a integer or a slice object, the first n rows (or the rows defined by the slice object) will be returned; * If a user supplies an object of type geopandas.GeoDataFrame - or of type geopandas.GeoSeries, rows that intersect the input - object will be returned. CRS misalignment will be resolved - internally. + or of type geopandas.GeoSeries, rows that intersect the input + object will be returned. 
CRS misalignment will be resolved + internally. * A dict of format {"address": "buffer_distance"} will return rows - that intersect a buffer of a given distance (in meters) around an - input address. + that intersect a buffer of a given distance (in meters) around an + input address. Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of PUMAs. Notes - ---------- - 2020 PUMAs are available with `year = 2022` and later. PUMAs are not available in the 2020 and 2021 CB files; - use `year = 2019` or earlier to retrieve 2010 PUMAs. - - See https://www.census.gov/programs-surveys/geography/guidance/geo-areas/pumas.html for more information. + ----- + 2020 PUMAs are available with `year = 2022` and later. PUMAs are not available in + the 2020 and 2021 CB files; use `year = 2019` or earlier to retrieve 2010 PUMAs. + + See https://www.census.gov/programs-surveys/geography/guidance/geo-areas/pumas.html + for more information. """ - if year is None: year = 2024 print(f"Using the default year of {year}") - + if state is None: if year == 2019 and cb: state = "us" @@ -494,23 +564,29 @@ def pumas(state = None, cb = False, year = None, cache = False, subset_by = None fips = fips_codes() print("Retrieving PUMAs by state and combining the result") - all_states = [code for code in fips['state_code'].unique().tolist() if code <= "56"] + all_states = [ + code for code in fips["state_code"].unique().tolist() if code <= "56" + ] - all_pumas = pd.concat([pumas(x, year = year, cache = cache, protocol = protocol, timeout = timeout) for x in all_states]) + all_pumas = pd.concat( + [ + pumas(x, year=year, cache=cache, protocol=protocol, timeout=timeout) + for x in all_states + ] + ) return all_pumas else: state = validate_state(state) - - if year > 2021: - suf = "20" - else: - suf = "10" - + suf = "20" if year > 2021 else "10" + if cb: if year in [2020, 2021]: - raise ValueError("Cartographic boundary PUMAs are not yet available for years after 2019. 
Use the argument `year = 2019` instead to request your data.") + raise ValueError( + "Cartographic boundary PUMAs are not yet available for years after " + "2019. Use the argument `year = 2019` instead to request your data." + ) else: if year == 2013: url = f"https://www2.census.gov/geo/tiger/GENZ{year}/cb_{year}_{state}_puma{suf}_500k.zip" @@ -519,159 +595,189 @@ def pumas(state = None, cb = False, year = None, cache = False, subset_by = None else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/PUMA/tl_{year}_{state}_puma{suf}.zip" - pm = _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) + pm = _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) return pm - -def places(state = None, cb = False, year = None, cache = False, subset_by = None, protocol = "http", timeout = 1800): +def places( + state=None, + cb=False, + year=None, + cache=False, + subset_by=None, + protocol="http", + timeout=1800, +): """ Load a Census-designated places shapefile into Python as a GeoDataFrame Parameters ---------- state : str - The state name, state abbreviation, or two-digit FIPS code of the desired state. + The state name, state abbreviation, or two-digit FIPS code of the desired state. If None (the default), places for the entire United States - will be downloaded if available for that year / dataset combination. - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. - Defaults to False (the regular TIGER/Line file). - year : int + will be downloaded if available for that year / dataset combination. + cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. + Defaults to False (the regular TIGER/Line file). + year : int The year of the TIGER/Line or cartographic boundary shapefile. If not specified, defaults to 2024. 
- cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. subset_by : tuple, int, slice, dict, geopandas.GeoDataFrame, or geopandas.GeoSeries - An optional directive telling pygris to return a subset of data using - underlying arguments in geopandas.read_file(). + An optional directive telling pygris to return a subset of data using + underlying arguments in geopandas.read_file(). subset_by operates as follows: - * If a user supplies a tuple of format (minx, miny, maxx, maxy), + * If a user supplies a tuple of format (minx, miny, maxx, maxy), it will be interpreted as a bounding box and rows will be returned that intersect that bounding box; * If a user supplies a integer or a slice object, the first n rows (or the rows defined by the slice object) will be returned; * If a user supplies an object of type geopandas.GeoDataFrame - or of type geopandas.GeoSeries, rows that intersect the input - object will be returned. CRS misalignment will be resolved - internally. + or of type geopandas.GeoSeries, rows that intersect the input + object will be returned. CRS misalignment will be resolved + internally. * A dict of format {"address": "buffer_distance"} will return rows - that intersect a buffer of a given distance (in meters) around an - input address. + that intersect a buffer of a given distance (in meters) around an + input address. Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of Census-designated places. Notes - ---------- - See https://www2.census.gov/geo/pdfs/reference/GARM/Ch9GARM.pdf for more information. + ----- + See https://www2.census.gov/geo/pdfs/reference/GARM/Ch9GARM.pdf for more + information. 
""" - if year is None: year = 2024 print(f"Using the default year of {year}") - + if state is None: if year < 2019: - raise ValueError("Retrieving Census-designated data for the entire US only available for years on or after 2019") + raise ValueError( + "Retrieving Census-designated data for the entire US only available " + "for years on or after 2019" + ) elif not cb: - raise ValueError("Retrieving Census-designated data for the entire US only available when cb is set to True") + raise ValueError( + "Retrieving Census-designated data for the entire US only available " + "when cb is set to True" + ) else: state = "us" print("Retrieving Census-designated places for the entire United States") else: state = validate_state(state) - + if cb: url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_{state}_place_500k.zip" else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/PLACE/tl_{year}_{state}_place.zip" - return _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) - - -def zctas(cb = False, starts_with = None, year = None, state = None, cache = False, subset_by = None, protocol = "http", timeout = 1800): - + return _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) + + +def zctas( + cb=False, + starts_with=None, + year=None, + state=None, + cache=False, + subset_by=None, + protocol="http", + timeout=1800, +): """ Load a zip code tabulation areas (ZCTAs) shapefile into Python as a GeoDataFrame Parameters ---------- cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. + If set to True, download a generalized (1:500k) cartographic boundary file. Defaults to False (the regular TIGER/Line file). starts_with : str or list - A string (or list of strings) representing the beginning characters of the - ZCTAs to be returned by the function. 
+ A string (or list of strings) representing the beginning characters of the + ZCTAs to be returned by the function. year : int The year of the TIGER/Line or cartographic boundary shapefile. If not specified, defaults to 2024. state : str - The state name, state abbreviation, or two-digit FIPS code of the desired state. + The state name, state abbreviation, or two-digit FIPS code of the desired state. If None (the default), ZCTAs for the entire United States - will be downloaded if available for that year / dataset combination. - cache : bool - If True, the function will download a Census shapefile to a cache directory + will be downloaded if available for that year / dataset combination. + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. subset_by : tuple, int, slice, dict, geopandas.GeoDataFrame, or geopandas.GeoSeries - An optional directive telling pygris to return a subset of data using - underlying arguments in geopandas.read_file(). + An optional directive telling pygris to return a subset of data using + underlying arguments in geopandas.read_file(). subset_by operates as follows: - * If a user supplies a tuple of format (minx, miny, maxx, maxy), + * If a user supplies a tuple of format (minx, miny, maxx, maxy), it will be interpreted as a bounding box and rows will be returned that intersect that bounding box; * If a user supplies a integer or a slice object, the first n rows (or the rows defined by the slice object) will be returned; * If a user supplies an object of type geopandas.GeoDataFrame - or of type geopandas.GeoSeries, rows that intersect the input - object will be returned. CRS misalignment will be resolved - internally. + or of type geopandas.GeoSeries, rows that intersect the input + object will be returned. 
CRS misalignment will be resolved + internally. * A dict of format {"address": "buffer_distance"} will return rows - that intersect a buffer of a given distance (in meters) around an - input address. + that intersect a buffer of a given distance (in meters) around an + input address. Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of zip code tabulation areas. Notes - ---------- + ----- ZCTAs are approximations of zip codes, which themselves are not formally defined as areas - by the United States Postal Service. In turn, not all zip codes will have corresponding - ZCTAs. For these reasons, ZCTAs are not recommended for spatial analysis and should - be used with appropriate caution. - - See https://www.census.gov/programs-surveys/geography/guidance/geo-areas/zctas.html for more information. + by the United States Postal Service. In turn, not all zip codes will have corresponding + ZCTAs. For these reasons, ZCTAs are not recommended for spatial analysis and should + be used with appropriate caution. - """ + See https://www.census.gov/programs-surveys/geography/guidance/geo-areas/zctas.html + for more information. + """ # noqa: E501 if year is None: year = 2024 print(f"Using the default year of {year}") - + if state is not None and year > 2010: raise ValueError("ZCTAs are only available by state for 2000 and 2010.") - + if state is not None and year == 2010 and cb: raise ValueError("ZCTAs are only available by state for 2010 when cb = FALSE.") - + if year == 1990: - raise ValueError("Zip Code Tabulation Areas are only available beginning with the 2000 Census.") - + raise ValueError( + "Zip Code Tabulation Areas are only available beginning with the " + "2000 Census." 
+ ) + if state is not None: state = validate_state(state) - + if not cache: - Warning("ZCTAs can take several minutes to download.\nTo cache the data and avoid re-downloading in future sessions, use the argument `cache = True`.") - + Warning( + "ZCTAs can take several minutes to download.\nTo cache the data and avoid " + "re-downloading in future sessions, use the argument `cache = True`." + ) if cb: if year == 2000: @@ -680,7 +786,9 @@ def zctas(cb = False, starts_with = None, year = None, state = None, cache = Fal else: url = f"https://www2.census.gov/geo/tiger/PREVGENZ/zt/z500shp/zt{state}_d00_shp.zip" elif year == 2010: - url = "https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_us_860_00_500k.zip" + url = ( + "https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_us_860_00_500k.zip" + ) elif year >= 2020: url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_zcta520_500k.zip" else: @@ -701,30 +809,41 @@ def zctas(cb = False, starts_with = None, year = None, state = None, cache = Fal url = f"https://www2.census.gov/geo/tiger/TIGER2010/ZCTA5/{year}/tl_2010_{state}_zcta5{suf}.zip" else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/ZCTA5/tl_{year}_us_zcta510.zip" - - zcta = _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) + + zcta = _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) if starts_with is not None: cols = zcta.columns.tolist() - zcta_ix = [i for i, j in enumerate(cols) if j.startswith("ZCTA")][0] + zcta_ix = next(i for i, j in enumerate(cols) if j.startswith("ZCTA")) zcta_col = cols[zcta_ix] if type(starts_with) is not list: + expr = "^" + starts_with zcta_sub = zcta.loc[zcta[zcta_col].str.startswith(expr)] else: tmp = ["^" + i for i in starts_with] expr = "|".join(tmp) - zcta_sub = zcta.loc[zcta[zcta_col].str.contains(expr)] + zcta_sub = zcta.loc[zcta[zcta_col].str.contains(expr)] return zcta_sub - + else: return zcta - -def blocks(state, 
county = None, year = None, cache = False, subset_by = None, protocol = "http", timeout = 1800): + +def blocks( + state, + county=None, + year=None, + cache=False, + subset_by=None, + protocol="http", + timeout=1800, +): """ Load a Census blocks shapefile into Python as a GeoDataFrame @@ -734,41 +853,42 @@ def blocks(state, county = None, year = None, cache = False, subset_by = None, p The state name, state abbreviation, or two-digit FIPS code of the desired state. county : str The county name or three-digit FIPS code of the desired county. If None, blocks - for the selected state will be downloaded. - year : int - The year of the TIGER/Line or cartographic boundary shapefile. - cache : bool - If True, the function will download a Census shapefile to a cache directory + for the selected state will be downloaded. + year : int + The year of the TIGER/Line or cartographic boundary shapefile. + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. subset_by : tuple, int, slice, dict, geopandas.GeoDataFrame, or geopandas.GeoSeries - An optional directive telling pygris to return a subset of data using - underlying arguments in geopandas.read_file(). + An optional directive telling pygris to return a subset of data using + underlying arguments in geopandas.read_file(). 
subset_by operates as follows: - * If a user supplies a tuple of format (minx, miny, maxx, maxy), + * If a user supplies a tuple of format (minx, miny, maxx, maxy), it will be interpreted as a bounding box and rows will be returned that intersect that bounding box; * If a user supplies a integer or a slice object, the first n rows (or the rows defined by the slice object) will be returned; * If a user supplies an object of type geopandas.GeoDataFrame - or of type geopandas.GeoSeries, rows that intersect the input - object will be returned. CRS misalignment will be resolved - internally. + or of type geopandas.GeoSeries, rows that intersect the input + object will be returned. CRS misalignment will be resolved + internally. * A dict of format {"address": "buffer_distance"} will return rows - that intersect a buffer of a given distance (in meters) around an - input address. + that intersect a buffer of a given distance (in meters) around an + input address. + Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of Census blocks. Notes - ---------- - See https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2020/TGRSHP2020_TechDoc.pdf for more information. - - - """ + ----- + See https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2020/TGRSHP2020_TechDoc.pdf + for more information. + + """ if year is None: year = 2024 print(f"Using the default year of {year}") @@ -779,13 +899,17 @@ def blocks(state, county = None, year = None, cache = False, subset_by = None, p state = validate_state(state) if not cache: - Warning("Block shapefiles can take several minutes to download.\nConsider using `cache = True` to store block shapefiles\nin a local cache and avoid future downloads.") + Warning( + "Block shapefiles can take several minutes to download.\nConsider using " + "`cache = True` to store block shapefiles\nin a local cache and avoid " + "future downloads." 
+ ) if year in [2000, 2010]: suf = str(year)[2:] if county is not None: county = validate_county(state, county) - + url = f"https://www2.census.gov/geo/tiger/TIGER2010/TABBLOCK/{year}/tl_2010_{state}{county}_tabblock{suf}.zip" else: url = f"https://www2.census.gov/geo/tiger/TIGER2010/TABBLOCK/{year}/tl_2010_{state}_tabblock{suf}.zip" @@ -796,78 +920,88 @@ def blocks(state, county = None, year = None, cache = False, subset_by = None, p else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/TABBLOCK20/tl_{year}_{state}_tabblock20.zip" - blks = _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) + blks = _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) if county is not None and year > 2010: if year > 2019: if type(county) is not list: county = [county] valid_county = [validate_county(state, x) for x in county] - blks = blks.query('COUNTYFP20 in @valid_county') + blks = blks.query("COUNTYFP20 in @valid_county") else: if type(county) is not list: county = [county] - valid_county = [validate_county(state, x) for x in county] - blks = blks.query('COUNTYFP10 in @valid_county') - + + valid_county = [validate_county(state, x) for x in county] # noqa: F841 + blks = blks.query("COUNTYFP10 in @valid_county") + return blks -def county_subdivisions(state, county = None, cb = False, year = None, cache = False, subset_by = None, protocol = "http", timeout = 1800): +def county_subdivisions( + state, + county=None, + cb=False, + year=None, + cache=False, + subset_by=None, + protocol="http", + timeout=1800, +): """ Load a county subdivisions shapefile into Python as a GeoDataFrame Parameters ---------- - state : str - The state name, state abbreviation, or two-digit FIPS code of the desired state. - If None, county subdivisions for the entire United States will be downloaded - if available for that year / dataset combination. 
+ state : str + The state name, state abbreviation, or two-digit FIPS code of the desired state. + If None, county subdivisions for the entire United States will be downloaded + if available for that year / dataset combination. county : str - The county name or three-digit FIPS code of the desired county. If None, county subdivisions - for the selected state will be downloaded. - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. + The county name or three-digit FIPS code of the desired county. If None, county + subdivisions for the selected state will be downloaded. + cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. Defaults to False (the regular TIGER/Line file). - year : int - The year of the TIGER/Line or cartographic boundary shapefile. - cache : bool - If True, the function will download a Census shapefile to a cache directory + year : int + The year of the TIGER/Line or cartographic boundary shapefile. + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. subset_by : tuple, int, slice, dict, geopandas.GeoDataFrame, or geopandas.GeoSeries - An optional directive telling pygris to return a subset of data using - underlying arguments in geopandas.read_file(). + An optional directive telling pygris to return a subset of data using + underlying arguments in geopandas.read_file(). 
subset_by operates as follows: - * If a user supplies a tuple of format (minx, miny, maxx, maxy), + * If a user supplies a tuple of format (minx, miny, maxx, maxy), it will be interpreted as a bounding box and rows will be returned that intersect that bounding box; * If a user supplies a integer or a slice object, the first n rows (or the rows defined by the slice object) will be returned; * If a user supplies an object of type geopandas.GeoDataFrame - or of type geopandas.GeoSeries, rows that intersect the input - object will be returned. CRS misalignment will be resolved - internally. + or of type geopandas.GeoSeries, rows that intersect the input + object will be returned. CRS misalignment will be resolved + internally. * A dict of format {"address": "buffer_distance"} will return rows - that intersect a buffer of a given distance (in meters) around an - input address. + that intersect a buffer of a given distance (in meters) around an + input address. Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of county subdivisions. - Notes - ---------- - See https://www2.census.gov/geo/pdfs/reference/GARM/Ch8GARM.pdf for more information. - - - """ + ----- + See https://www2.census.gov/geo/pdfs/reference/GARM/Ch8GARM.pdf for more + information. 
+ """ if year is None: year = 2024 print(f"Using the default year of {year}") - + state = validate_state(state) if cb: @@ -882,15 +1016,15 @@ def county_subdivisions(state, county = None, cb = False, year = None, cache = F url = f"https://www2.census.gov/geo/tiger/TIGER2010/COUSUB/2010/tl_2010_{state}_cousub10.zip" else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/COUSUB/tl_{year}_{state}_cousub.zip" - - cs = _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) + + cs = _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) if county is not None: if type(county) is not list: county = [county] - valid_county = [validate_county(state, x) for x in county] - cs = cs.query('COUNTYFP in @valid_county') - - return cs - + valid_county = [validate_county(state, x) for x in county] # noqa: F841 + cs = cs.query("COUNTYFP in @valid_county") + return cs diff --git a/pygris/geocode.py b/pygris/geocode.py index 75c843b..9534234 100644 --- a/pygris/geocode.py +++ b/pygris/geocode.py @@ -1,25 +1,30 @@ -import requests -import pandas as pd +import csv import json +from io import StringIO + import geopandas as gp import numpy as np -from io import StringIO -import csv +import pandas as pd +import requests def _parse_geographies(response_obj, geography, keep_geo_cols, type): - # Walk through the response object + # Walk through the response object # first, grab appropriate geography data if type == "geocode": - geo_data = pd.json_normalize(response_obj['result'], ['addressMatches', 'geographies', geography]) + geo_data = pd.json_normalize( + response_obj["result"], ["addressMatches", "geographies", geography] + ) if not keep_geo_cols: - geo_data = geo_data.filter(['GEOID']) + geo_data = geo_data.filter(["GEOID"]) # Next, get the coordinates - coords = pd.json_normalize(response_obj['result'], 'addressMatches').filter(['coordinates.x', 'coordinates.y']) + coords = 
pd.json_normalize(response_obj["result"], "addressMatches").filter( + ["coordinates.x", "coordinates.y"] + ) - coords.columns = ['longitude', 'latitude'] + coords.columns = ["longitude", "latitude"] # Combine the two frames out = coords.join(geo_data) @@ -27,29 +32,38 @@ def _parse_geographies(response_obj, geography, keep_geo_cols, type): return out else: - geo_data = pd.json_normalize(response_obj['result'], ['geographies', geography]) + geo_data = pd.json_normalize(response_obj["result"], ["geographies", geography]) if not keep_geo_cols: - geo_data = geo_data.filter(['GEOID']) - + geo_data = geo_data.filter(["GEOID"]) + return geo_data - -def geocode(address = None, street = None, city = None, state = None, zip = None, - benchmark = "Public_AR_Current", - vintage = "Census2020_Current", as_gdf = False, - geography = "Census Blocks", limit = 1, - keep_geo_cols = False, return_dict = False): + +def geocode( + address=None, + street=None, + city=None, + state=None, + zip=None, + benchmark="Public_AR_Current", + vintage="Census2020_Current", + as_gdf=False, + geography="Census Blocks", + limit=1, + keep_geo_cols=False, + return_dict=False, +): """ Use the Census geocoder to return XY coordinates and Census geography for an input address in the United States. Parameters - --------------- + ---------- address : str A single-line address to be geocoded, e.g. "1600 Pennsylvania Ave, Washington DC 20500" street : str - The street address component, e.g. "1600 Pennsylvania Ave", if breaking out the address into + The street address component, e.g. "1600 Pennsylvania Ave", if breaking out the address into multiple arguments. city : str The city address component, e.g. "Washington" @@ -58,41 +72,41 @@ def geocode(address = None, street = None, city = None, state = None, zip = None zip : str The zip code address component, e.g. "20500" benchmark : str - The geocoding benchmark to use. 
Defaults to "Public_AR_Current"; other options are - outlined at https://geocoding.geo.census.gov/geocoder/benchmarks. + The geocoding benchmark to use. Defaults to "Public_AR_Current"; other options are + outlined at https://geocoding.geo.census.gov/geocoder/benchmarks. vintage : str The geocoding vintage to use. Defaults to "Census2020_Current" to return 2020 Census - geographies. Vintages available for a given benchmark can be looked up at - https://geocoding.geo.census.gov/geocoder/vintages?benchmark={benchmark_id}, - where benchmark_id is replaced with the benchmark ID. + geographies. Vintages available for a given benchmark can be looked up at + https://geocoding.geo.census.gov/geocoder/vintages?benchmark={benchmark_id}, + where benchmark_id is replaced with the benchmark ID. as_gdf : bool - If False (the default), returns a regular Pandas DataFrame of results. + If False (the default), returns a regular Pandas DataFrame of results. If True, converts the DataFrame into a GeoDataFrame of points. geography : str - The Census geography to return; defaults to 'Census Blocks'. + The Census geography to return; defaults to 'Census Blocks'. limit : int How many records to return (as the geocoder can sometimes return multiple matches). Defaults to 1. keep_geo_cols : bool The Census geocoder can return a wide range of contextual information about - a location with its response. If True, return all of these columns + a location with its response. If True, return all of these columns (default False) return_dict : bool - Advanced users may want to keep the general structure of the Census - geocoder response as a dict without having pygris parse the response. + Advanced users may want to keep the general structure of the Census + geocoder response as a dict without having pygris parse the response. If so, use True (default False). - + Returns - --------------- + ------- A DataFrame (or GeoDataFrame) representing the geocoded address. 
Notes - --------------- - See https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.html for more information. - - """ + ----- + See https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.html + for more information. + """ # noqa: E501 if address is not None: url = "https://geocoding.geo.census.gov/geocoder/geographies/onelineaddress" elif street is not None: @@ -100,132 +114,164 @@ def geocode(address = None, street = None, city = None, state = None, zip = None else: raise ValueError("Either a single-line address or street must be specified.") - req = requests.get(url = url, - params = {"address": address, - "street": street, - "city": city, - "state": state, - "zip": zip, - "benchmark": benchmark, - "vintage": vintage, - "format": "json"}) + req = requests.get( + url=url, + params={ + "address": address, + "street": street, + "city": city, + "state": state, + "zip": zip, + "benchmark": benchmark, + "vintage": vintage, + "format": "json", + }, + ) if req.status_code != 200: raise SyntaxError(f"Your request failed. 
The error message is {req.text}") - + r = json.loads(req.text) if return_dict: return r else: - output = _parse_geographies(response_obj = r, geography = geography, keep_geo_cols = keep_geo_cols, - type = "geocode") + output = _parse_geographies( + response_obj=r, + geography=geography, + keep_geo_cols=keep_geo_cols, + type="geocode", + ) if address is not None: - output['address'] = address + output["address"] = address elif street is not None: - output['street'] = street - output['city'] = city - output['state'] = state - output['zip'] = zip + output["street"] = street + output["city"] = city + output["state"] = state + output["zip"] = zip output = output.iloc[0:limit] if as_gdf: - output = gp.GeoDataFrame(data = output, crs = 4326, geometry = gp.points_from_xy(x = output.longitude, y = output.latitude)) - - return output + output = gp.GeoDataFrame( + data=output, + crs=4326, + geometry=gp.points_from_xy(x=output.longitude, y=output.latitude), + ) + return output -def geolookup(longitude, latitude, - benchmark = "Public_AR_Current", - vintage = "Census2020_Current", - geography = "Census Blocks", limit = 1, - keep_geo_cols = False, return_dict = False): +def geolookup( + longitude, + latitude, + benchmark="Public_AR_Current", + vintage="Census2020_Current", + geography="Census Blocks", + limit=1, + keep_geo_cols=False, + return_dict=False, +): """ Use the Census GeoLookup service to return Census geography for an XY coordinate pair in the United States. Parameters - --------------- + ---------- longitude : float The X (longitude) coordinate of your requested location. latitude : float The Y (latitude) coordinate of your requested location. benchmark : str - The geocoding benchmark to use. Defaults to "Public_AR_Current"; other options are - outlined at https://geocoding.geo.census.gov/geocoder/benchmarks. + The geocoding benchmark to use. Defaults to "Public_AR_Current"; other options are + outlined at https://geocoding.geo.census.gov/geocoder/benchmarks. 
vintage : str The geocoding vintage to use. Defaults to "Census2020_Current" to return 2020 Census - geographies. Vintages available for a given benchmark can be looked up at - https://geocoding.geo.census.gov/geocoder/vintages?benchmark={benchmark_id}, - where benchmark_id is replaced with the benchmark ID. + geographies. Vintages available for a given benchmark can be looked up at + https://geocoding.geo.census.gov/geocoder/vintages?benchmark={benchmark_id}, + where benchmark_id is replaced with the benchmark ID. geography : str - The Census geography to return; defaults to 'Census Blocks'. + The Census geography to return; defaults to 'Census Blocks'. limit : int How many records to return (as the geocoder can sometimes return multiple matches). Defaults to 1. keep_geo_cols : bool The Census geocoder can return a wide range of contextual information about - a location with its response. If True, return all of these columns + a location with its response. If True, return all of these columns (default False) return_dict : bool - Advanced users may want to keep the general structure of the Census - geocoder response as a dict without having pygris parse the response. + Advanced users may want to keep the general structure of the Census + geocoder response as a dict without having pygris parse the response. If so, use True (default False). - - Returns - --------------- - A DataFrame representing the location with contextual information from the Census Bureau. + Returns + ------- + A DataFrame representing the location with contextual information from the Census + Bureau. Notes - --------------- - See https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.html for more information. - - """ + ----- + See https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.html + for more information. 
+ """ # noqa: E501 url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates" - req = requests.get(url = url, - params = {"x": longitude, - "y": latitude, - "benchmark": benchmark, - "vintage": vintage, - "format": "json"}) - + req = requests.get( + url=url, + params={ + "x": longitude, + "y": latitude, + "benchmark": benchmark, + "vintage": vintage, + "format": "json", + }, + ) + if req.status_code != 200: raise SyntaxError(f"Your request failed. The error message is {req.text}") - + r = json.loads(req.text) if return_dict: return r else: - output = _parse_geographies(response_obj = r, geography = geography, keep_geo_cols = keep_geo_cols, - type = "geolookup") + output = _parse_geographies( + response_obj=r, + geography=geography, + keep_geo_cols=keep_geo_cols, + type="geolookup", + ) - output['longitude'] = longitude - output['latitude'] = latitude + output["longitude"] = longitude + output["latitude"] = latitude output = output.iloc[0:limit] return output -def batch_geocode(df, address, city = None, state = None, zip = None, - id_column = None, benchmark = "Public_AR_Current", - vintage = "Census2020_Current", as_gdf = False): +def batch_geocode( + df, + address, + city=None, + state=None, + zip=None, + id_column=None, + benchmark="Public_AR_Current", + vintage="Census2020_Current", + as_gdf=False, +): """ Use the Census batch geocoder to geocode a DataFrame of addresses in the Unied States. Parameters - --------------- + ---------- df : pandas.DataFrame - A Pandas DataFrame containing addresses to be geocoded. Address components should be + A Pandas DataFrame containing addresses to be geocoded. Address components should be split across columns, meaning that separate columns are required for street address, - city, state, and zip code. + city, state, and zip code. street : str The name of the street address column, e.g. 
"address" city : str @@ -235,89 +281,105 @@ def batch_geocode(df, address, city = None, state = None, zip = None, zip : str The name of the zip code column, e.g. "zip" benchmark : str - The geocoding benchmark to use. Defaults to "Public_AR_Current"; other options are - outlined at https://geocoding.geo.census.gov/geocoder/benchmarks. + The geocoding benchmark to use. Defaults to "Public_AR_Current"; other options are + outlined at https://geocoding.geo.census.gov/geocoder/benchmarks. vintage : str The geocoding vintage to use. Defaults to "Census2020_Current" to return 2020 Census - geographies. Vintages available for a given benchmark can be looked up at - https://geocoding.geo.census.gov/geocoder/vintages?benchmark={benchmark_id}, - where benchmark_id is replaced with the benchmark ID. + geographies. Vintages available for a given benchmark can be looked up at + https://geocoding.geo.census.gov/geocoder/vintages?benchmark={benchmark_id}, + where benchmark_id is replaced with the benchmark ID. as_gdf : bool - If False (the default), returns a regular Pandas DataFrame of results. + If False (the default), returns a regular Pandas DataFrame of results. If True, converts the DataFrame into a GeoDataFrame of points. - + Returns - --------------- + ------- A DataFrame (or GeoDataFrame) representing the geocoded addresses. Notes - --------------- - See https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.html for more information. - - """ + ----- + See https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.html + for more information. + """ # noqa: E501 # Check to make sure the dataset doesn't exceed 10k rows if df.shape[0] > 10000: - raise ValueError("The row limit for the Census batch geocoder is 10,000. Consider splitting your request into multiple requests and retry.") - + raise ValueError( + "The row limit for the Census batch geocoder is 10,000. " + "Consider splitting your request into multiple requests and retry." 
+ ) + # Prep the df object for sending to the geocoder if id_column is None: request_df = pd.DataFrame( - {"Unique ID": range(0, df.shape[0]), - "Street address": df[address]} + {"Unique ID": range(0, df.shape[0]), "Street address": df[address]} ) else: request_df = pd.DataFrame( - {"Unique ID": df[id_column], - "Street address": df[address]} - ) + {"Unique ID": df[id_column], "Street address": df[address]} + ) if city is None: request_df["City"] = np.nan else: request_df["City"] = df[city] - + if state is None: request_df["State"] = np.nan else: request_df["State"] = df[state] - + if zip is None: request_df["ZIP"] = np.nan else: request_df["ZIP"] = df[zip] # Store the df as a CSV - request_csv = request_df.to_csv(index = False, header = False) + request_csv = request_df.to_csv(index=False, header=False) # Formulate the request req = requests.post( - url = "https://geocoding.geo.census.gov/geocoder/geographies/addressbatch", - files = {"addressFile": ('request.csv', request_csv)}, - data = { - "benchmark": benchmark, - "vintage": vintage - } + url="https://geocoding.geo.census.gov/geocoder/geographies/addressbatch", + files={"addressFile": ("request.csv", request_csv)}, + data={"benchmark": benchmark, "vintage": vintage}, ) if req.status_code != 200: raise SyntaxError(f"Your request failed. 
The error message is {req.text}") - output = pd.read_csv(StringIO(req.text), sep = ",", header = None, quoting = csv.QUOTE_ALL) + output = pd.read_csv( + StringIO(req.text), sep=",", header=None, quoting=csv.QUOTE_ALL + ) # name the columns appropriately - output.columns = ['id', 'address', 'status', 'match_quality', 'matched_address', 'coordinates', 'tiger_line_id', 'tiger_side', - 'state', 'county', 'tract', 'block'] + output.columns = [ + "id", + "address", + "status", + "match_quality", + "matched_address", + "coordinates", + "tiger_line_id", + "tiger_side", + "state", + "county", + "tract", + "block", + ] # split longitude/latitude - output = output.join(output['coordinates'].str.split(',', expand = True).rename(columns = {0: 'longitude', 1: 'latitude'})).drop('coordinates', axis = 1) + output = output.join( + output["coordinates"] + .str.split(",", expand=True) + .rename(columns={0: "longitude", 1: "latitude"}) + ).drop("coordinates", axis=1) if as_gdf: - output = gp.GeoDataFrame(data = output, crs = 4326, geometry = gp.points_from_xy(x = output.longitude, y = output.latitude)) + output = gp.GeoDataFrame( + data=output, + crs=4326, + geometry=gp.points_from_xy(x=output.longitude, y=output.latitude), + ) return output - - - - diff --git a/pygris/geometry.py b/pygris/geometry.py index e45f333..f5f52db 100644 --- a/pygris/geometry.py +++ b/pygris/geometry.py @@ -1,19 +1,18 @@ -from pygris.enumeration_units import counties, tracts, block_groups, blocks +from pygris.enumeration_units import block_groups, blocks, counties, tracts # Helper function to get geometry (LODES-only for now) def _get_geometry(geography, state, year, cb, cache): if geography == "county": - geo = counties(cb = cb, state = state, year = year, cache = cache) + geo = counties(cb=cb, state=state, year=year, cache=cache) elif geography == "tract": - geo = tracts(cb = cb, state = state, year = year, cache = cache) + geo = tracts(cb=cb, state=state, year=year, cache=cache) elif geography == "block 
group": - geo = block_groups(cb = cb, state = state, year = year, cache = cache) + geo = block_groups(cb=cb, state=state, year=year, cache=cache) elif geography == "block": - geo = blocks(state = state, year = year, cache = cache) - geo = geo.rename({"GEOID20": "GEOID"}, axis = 1) - + geo = blocks(state=state, year=year, cache=cache) + geo = geo.rename({"GEOID20": "GEOID"}, axis=1) - geo_sub = geo.filter(['GEOID', 'geometry']) + geo_sub = geo.filter(["GEOID", "geometry"]) - return geo_sub \ No newline at end of file + return geo_sub diff --git a/pygris/helpers.py b/pygris/helpers.py index dc4e2cc..691e788 100644 --- a/pygris/helpers.py +++ b/pygris/helpers.py @@ -1,20 +1,23 @@ -import requests -import geopandas as gp +import contextlib +import ftplib import os +import re # noqa: F401 import tempfile +from urllib.parse import urlparse + import appdirs +import geopandas as gp import pandas as pd -import re -import ftplib -import zipfile -from urllib.parse import urlparse -from pygris.internal_data import fips_path +import requests + from pygris.geocode import geocode +from pygris.internal_data import fips_path + def _load_tiger(url, cache=False, subset_by=None, protocol="http", timeout=1800): """ Helper function to load census TIGER/Line shapefiles. - + Parameters ---------- url : str @@ -27,7 +30,7 @@ def _load_tiger(url, cache=False, subset_by=None, protocol="http", timeout=1800) Protocol to use for downloading files. Options are "http" (default) or "ftp". timeout : int Timeout in seconds for download operations. Defaults to 300 (5 minutes). 
- + Returns ------- geopandas.GeoDataFrame @@ -35,11 +38,11 @@ def _load_tiger(url, cache=False, subset_by=None, protocol="http", timeout=1800) """ # Store original URL before protocol modification original_url = url - + # Modify URL for FTP if requested if protocol == "ftp" and url.startswith("https://www2"): url = url.replace("https://www2", "ftp://ftp2") - + # Parse the subset_by argument to figure out what it should represent if subset_by is not None: if type(subset_by) is tuple: @@ -52,12 +55,12 @@ def _load_tiger(url, cache=False, subset_by=None, protocol="http", timeout=1800) buffers = [] for i, j in subset_by.items(): g = geocode(address=i, as_gdf=True, limit=1) - g_buffer = g.to_crs('ESRI:102010').buffer(distance=j) + g_buffer = g.to_crs("ESRI:102010").buffer(distance=j) buffers.append(g_buffer) - + buffer_gdf = pd.concat(buffers) sub = {"mask": buffer_gdf} - + # Determine where to save the file if cache: cache_dir = appdirs.user_cache_dir("pygris") @@ -70,17 +73,17 @@ def _load_tiger(url, cache=False, subset_by=None, protocol="http", timeout=1800) tmp_dir = tempfile.gettempdir() basename = os.path.basename(url) file_path = os.path.join(tmp_dir, basename) - + # Download the file if it doesn't exist (or if not using cache) download_needed = not os.path.isfile(file_path) if cache else True - + if download_needed: download_success = False - + # Parse the URL to determine protocol and path parsed_url = urlparse(url) - is_ftp = parsed_url.scheme == 'ftp' - + is_ftp = parsed_url.scheme == "ftp" + # Try with the primary protocol try: if is_ftp: @@ -88,94 +91,101 @@ def _load_tiger(url, cache=False, subset_by=None, protocol="http", timeout=1800) ftp_host = parsed_url.netloc ftp_path = os.path.dirname(parsed_url.path) filename = os.path.basename(parsed_url.path) - + print(f"Downloading {filename} from Census FTP...") ftp = ftplib.FTP(ftp_host) ftp.login() # anonymous login ftp.cwd(ftp_path) - - with open(file_path, 'wb') as f: - ftp.retrbinary(f'RETR {filename}', 
f.write) - + + with open(file_path, "wb") as f: + ftp.retrbinary(f"RETR {filename}", f.write) + ftp.quit() download_success = True else: # Handle HTTP download with requests req = requests.get(url=url, timeout=timeout) req.raise_for_status() # Raise an exception for HTTP errors - - with open(file_path, 'wb') as fd: + + with open(file_path, "wb") as fd: fd.write(req.content) download_success = True - - except Exception as e: + + except Exception: # If HTTP fails and we're using HTTP, try FTP as fallback if protocol == "http" and not is_ftp: print("HTTP download failed, trying FTP as fallback...") ftp_url = original_url.replace("https://www2", "ftp://ftp2") - + try: # Parse the FTP URL parsed_ftp = urlparse(ftp_url) ftp_host = parsed_ftp.netloc ftp_path = os.path.dirname(parsed_ftp.path) filename = os.path.basename(parsed_ftp.path) - + # Connect to FTP and download print(f"Downloading {filename} from Census FTP...") ftp = ftplib.FTP(ftp_host) ftp.login() # anonymous login ftp.cwd(ftp_path) - - with open(file_path, 'wb') as f: - ftp.retrbinary(f'RETR {filename}', f.write) - + + with open(file_path, "wb") as f: + ftp.retrbinary(f"RETR {filename}", f.write) + ftp.quit() download_success = True - - except Exception as e2: + + except Exception: download_success = False - + # If both HTTP and FTP failed, raise an error if not download_success: raise ValueError( - "Download failed with both HTTP and FTP; check your internet connection or the status of the Census Bureau website " + "Download failed with both HTTP and FTP; check your internet " + "connection or the status of the Census Bureau website " "at https://www2.census.gov/geo/tiger/ or ftp://ftp2.census.gov/geo/tiger/." 
) - + # Read the file from the local filesystem try: if subset_by is not None: tiger_data = gp.read_file(file_path, **sub) else: tiger_data = gp.read_file(file_path) - + # Clean up temporary file if not caching if not cache and os.path.exists(file_path): - try: + with contextlib.suppress(Exception): os.remove(file_path) - except: - pass # Ignore errors in cleanup - + return tiger_data - + except Exception as e: # If the file is corrupted, remove it and try downloading again if cache and os.path.exists(file_path): try: os.remove(file_path) print("Cached file may be corrupted. Downloading again...") - return _load_tiger(original_url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout) - except: + return _load_tiger( + original_url, + cache=cache, + subset_by=subset_by, + protocol=protocol, + timeout=timeout, + ) + except Exception: pass raise e + def fips_codes(): path = fips_path() - return pd.read_csv(path, dtype = 'object') + return pd.read_csv(path, dtype="object") + -def validate_state(state, quiet = False): +def validate_state(state, quiet=False): # Standardize as lowercase original_input = state if isinstance(state, str): @@ -190,8 +200,8 @@ def validate_state(state, quiet = False): # If the FIPS code is supplied as an int elif isinstance(state, int): - #convert to string - state=str(state) + # convert to string + state = str(state) # Left-pad if necessary state = state.zfill(2) # Return the result @@ -201,73 +211,77 @@ def validate_state(state, quiet = False): fips = fips_codes() # If a state abbreviation, use the state postal code if len(state) == 2: - fips['postal_lower'] = fips.state.str.lower() - state_sub = fips.query('postal_lower == @state') + fips["postal_lower"] = fips.state.str.lower() + state_sub = fips.query("postal_lower == @state") if state_sub.shape[0] == 0: - raise ValueError("You have likely entered an invalid state code, please revise.") + raise ValueError( + "You have likely entered an invalid state code, please revise." 
+ ) else: state_fips = state_sub.state_code.unique()[0] - + if not quiet: print(f"Using FIPS code '{state_fips}' for input '{original_input}'") return state_fips else: # If a state name, grab the appropriate info from fips_codes - fips['name_lower'] = fips.state_name.str.lower() - state_sub = fips.query('name_lower == @state') + fips["name_lower"] = fips.state_name.str.lower() + state_sub = fips.query("name_lower == @state") if state_sub.shape[0] == 0: - raise ValueError("You have likely entered an invalid state code, please revise.") + raise ValueError( + "You have likely entered an invalid state code, please revise." + ) else: state_fips = state_sub.state_code.unique()[0] if not quiet: print(f"Using FIPS code '{state_fips}' for input '{original_input}'") - + return state_fips - -def validate_county(state, county, quiet = False): + + +def validate_county(state, county, quiet=False): state = validate_state(state) fips = fips_codes() - county_table = fips.query('state_code == @state') + county_table = fips.query("state_code == @state") # If they used numbers for the county: if county.isdigit(): # Left-pad with zeroes county.zfill(3) - + return county - + # Otherwise, if they pass a name: else: # Find counties in the table that could match - county_sub = county_table.query('county.str.contains(@county, flags = @re.IGNORECASE, regex = True)', - engine = 'python') + county_sub = county_table.query( + "county.str.contains(@county, flags = @re.IGNORECASE, regex = True)", + engine="python", + ) possible_counties = county_sub.county.unique() if len(possible_counties) == 0: raise ValueError("No county names match your input country string.") elif len(possible_counties) == 1: - - cty_code = (county_sub - .query('county == @possible_counties[0]') - .county_code - .unique()[0] - ) + cty_code = county_sub.query( + "county == @possible_counties[0]" + ).county_code.unique()[0] if not quiet: print(f"Using FIPS code '{cty_code}' for input '{county}'") return cty_code else: - msg = 
f"Your string matches {' and '.join(possible_counties)}. Please refine your selection." - - raise ValueError(msg) - - + msg = ( + f"Your string matches {' and '.join(possible_counties)}. " + "Please refine your selection." + ) + raise ValueError(msg) diff --git a/pygris/internal_data.py b/pygris/internal_data.py index 1e76f46..00b62e6 100644 --- a/pygris/internal_data.py +++ b/pygris/internal_data.py @@ -1,6 +1,7 @@ from importlib import resources + def fips_path(): with resources.path("pygris.internals", "fips_codes.csv") as path: data_file_path = path - return data_file_path \ No newline at end of file + return data_file_path diff --git a/pygris/legislative.py b/pygris/legislative.py index 0759084..5366581 100644 --- a/pygris/legislative.py +++ b/pygris/legislative.py @@ -2,11 +2,19 @@ __author__ = "Kyle Walker 2018 and cb: state = "us" - print("Retrieving state legislative districts for the entire United States.") + print( + "Retrieving state legislative districts for the entire United States." + ) else: - raise ValueError("A state must be specified for this year/dataset combination.") + raise ValueError( + "A state must be specified for this year/dataset combination." + ) else: state = validate_state(state) if house not in ["upper", "lower"]: - raise ValueError("You must specify either 'upper' or 'lower' as an argument for house.") - - if house == "lower": - type = "sldl" - else: - type = "sldu" + raise ValueError( + "You must specify either 'upper' or 'lower' as an argument for house." 
+ ) + type = "sldl" if house == "lower" else "sldu" if cb: if year == 2010: @@ -208,13 +233,23 @@ def state_legislative_districts(state = None, house = "upper", cb = False, else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/{type.upper()}/tl_{year}_{state}_{type}.zip" - stateleg = _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) + stateleg = _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) return stateleg -def voting_districts(state = None, county = None, cb = False, - year = 2020, cache = False, subset_by = None, protocol = "http", timeout = 1800): +def voting_districts( + state=None, + county=None, + cb=False, + year=2020, + cache=False, + subset_by=None, + protocol="http", + timeout=1800, +): """ Load a voting districts shapefile into Python as a GeoDataFrame @@ -232,7 +267,8 @@ def voting_districts(state = None, county = None, cb = False, Defaults to False (the regular TIGER/Line file). year : int The year of the TIGER/Line or cartographic boundary shapefile. Available years - for voting districts are 2020 (for 2020 districts) and 2012 (for 2010 districts). + for voting districts are 2020 (for 2020 districts) and 2012 + (for 2010 districts). cache : bool If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load @@ -259,42 +295,47 @@ def voting_districts(state = None, county = None, cb = False, The timeout for the download request in seconds. Defaults to 1800 (30 minutes). Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of voting districts. Notes - ---------- - See https://www2.census.gov/geo/pdfs/reference/GARM/Ch14GARM.pdf for more information. + ----- + See https://www2.census.gov/geo/pdfs/reference/GARM/Ch14GARM.pdf for more + information. 
""" - if year != 2020 and cb: - raise ValueError("Cartographic boundary voting district files are only available for 2020.") + raise ValueError( + "Cartographic boundary voting district files are only available for 2020." + ) if state is None: if year > 2018 and cb: state = "us" print("Retrieving voting districts for the entire United States") else: - raise ValueError("A state must be specified for this year/dataset combination.") + raise ValueError( + "A state must be specified for this year/dataset combination." + ) else: state = validate_state(state) - if cb: url = f"https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_{state}_vtd_500k.zip" - vtds = _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) + vtds = _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) if county is None: return vtds else: if type(county) is not list: county = [county] - valid_county = [validate_county(state, x) for x in county] - vtds = vtds.query('COUNTYFP20 in @valid_county') + valid_county = [validate_county(state, x) for x in county] # noqa: F841 + vtds = vtds.query("COUNTYFP20 in @valid_county") return vtds else: @@ -307,6 +348,8 @@ def voting_districts(state = None, county = None, cb = False, else: url = f"https://www2.census.gov/geo/tiger/TIGER2020PL/LAYER/VTD/2020/tl_2020_{state}_vtd20.zip" - vtds = _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) + vtds = _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) return vtds diff --git a/pygris/metro_areas.py b/pygris/metro_areas.py index 65960c5..828c29c 100644 --- a/pygris/metro_areas.py +++ b/pygris/metro_areas.py @@ -4,58 +4,67 @@ from pygris.helpers import _load_tiger -def core_based_statistical_areas(cb = False, resolution = "500k", year = None, cache = False, protocol = "http", timeout = 1800): + +def core_based_statistical_areas( + cb=False, 
resolution="500k", year=None, cache=False, protocol="http", timeout=1800 +): """ Load a core-based statistical areas shapefile into Python as a GeoDataFrame Parameters ---------- cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. + If set to True, download a generalized (1:500k) cartographic boundary file. Defaults to False (the regular TIGER/Line file). resolution : str - The resolution of the cartographic boundary file; only applies if + The resolution of the cartographic boundary file; only applies if the cb argument is set to True. The default is "500k"; options also - include "5m" (1:5 million) and "20m" (1:20 million) - year : int + include "5m" (1:5 million) and "20m" (1:20 million) + year : int The year of the TIGER/Line or cartographic boundary shapefile. If not specified, defaults to 2024. - cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. protocol : str The protocol to use for downloading the file. Defaults to "http". timeout : int The timeout for the download request in seconds. Defaults to 1800 (30 minutes). Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of core-based statistical areas Notes - ---------- - Core-based statistical areas include metropolitan and micropolitan statistical areas. - See https://www.census.gov/programs-surveys/metro-micro.html for more information. + ----- + Core-based statistical areas include metropolitan and micropolitan statistical + areas. See https://www.census.gov/programs-surveys/metro-micro.html for more + information. 
""" if year is None: year = 2024 print(f"Using the default year of {year}") - + if resolution not in ["500k", "5m", "20m"]: - raise ValueError("Invalid value for resolution. Valid values are '500k', '5m', and '20m'.") + raise ValueError( + "Invalid value for resolution. Valid values are '500k', '5m', and '20m'." + ) if year == 2022: - raise ValueError("CBSAs for 2022 are not yet defined due to the re-organization of counties in Connecticut.") - + raise ValueError( + "CBSAs for 2022 are not yet defined due to the re-organization of " + "counties in Connecticut." + ) + if cb: if year == 2010: if resolution == "5m": raise ValueError("`resolution = '5m' is unavailable for 2010.") - + url = f"https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_us_310_m1_{resolution}.zip" else: if year == 2013: @@ -64,104 +73,112 @@ def core_based_statistical_areas(cb = False, resolution = "500k", year = None, c url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_cbsa_{resolution}.zip" else: if year == 2010: - url = f"https://www2.census.gov/geo/tiger/TIGER2010/CBSA/2010/tl_2010_us_cbsa10.zip" + url = "https://www2.census.gov/geo/tiger/TIGER2010/CBSA/2010/tl_2010_us_cbsa10.zip" else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/CBSA/tl_{year}_us_cbsa.zip" - - return _load_tiger(url, cache = cache, protocol = protocol, timeout = timeout) + return _load_tiger(url, cache=cache, protocol=protocol, timeout=timeout) -def urban_areas(cb = False, year = None, cache = False, protocol = "http", timeout = 1800): + +def urban_areas(cb=False, year=None, cache=False, protocol="http", timeout=1800): """ Load a urbanized areas shapefile into Python as a GeoDataFrame Parameters ---------- - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. - Defaults to False (the regular TIGER/Line file). - year : int + cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. + Defaults to False (the regular TIGER/Line file). 
+ year : int The year of the TIGER/Line or cartographic boundary shapefile. If not specified, defaults to 2024. - cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. protocol : str The protocol to use for downloading the file. Defaults to "http". timeout : int The timeout for the download request in seconds. Defaults to 1800 (30 minutes). Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of urbanized areas Notes - ---------- - Urbanized areas are not yet defined for 2020; shapefiles use the old 2010 definitions. - See https://www.census.gov/programs-surveys/geography/guidance/geo-areas/urban-rural.html for more information. + ----- + Urbanized areas are not yet defined for 2020; shapefiles use the old 2010 + definitions. See https://www.census.gov/programs-surveys/geography/guidance/geo-areas/urban-rural.html + for more information. 
""" if year is None: year = 2024 print(f"Using the default year of {year}") - + if cb: if year == 2013: url = f"https://www2.census.gov/geo/tiger/GENZ{year}/cb_{year}_us_ua10_500k.zip" else: url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_ua10_500k.zip" else: - url = f"https://www2.census.gov/geo/tiger/TIGER{year}/UAC/tl_{year}_us_uac10.zip" - - return _load_tiger(url, cache = cache, protocol = protocol, timeout = timeout) + url = ( + f"https://www2.census.gov/geo/tiger/TIGER{year}/UAC/tl_{year}_us_uac10.zip" + ) + return _load_tiger(url, cache=cache, protocol=protocol, timeout=timeout) -def combined_statistical_areas(cb = False, resolution = "500k", year = None, cache = False, protocol = "http", timeout = 1800): + +def combined_statistical_areas( + cb=False, resolution="500k", year=None, cache=False, protocol="http", timeout=1800 +): """ Load a combined statistical areas shapefile into Python as a GeoDataFrame Parameters ---------- - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. + cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. Defaults to False (the regular TIGER/Line file). - resolution : str - The resolution of the cartographic boundary file; only applies if + resolution : str + The resolution of the cartographic boundary file; only applies if the cb argument is set to True. The default is "500k"; options also - include "5m" (1:5 million) and "20m" (1:20 million) - year : int + include "5m" (1:5 million) and "20m" (1:20 million) + year : int The year of the TIGER/Line or cartographic boundary shapefile. If not specified, defaults to 2024. - cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. 
+ the shapefile directly from the Census website. protocol : str The protocol to use for downloading the file. Defaults to "http". timeout : int The timeout for the download request in seconds. Defaults to 1800 (30 minutes). Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of urbanized areas Notes - ---------- - See https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2020/TGRSHP2020_TechDoc.pdf for more information. + ----- + See https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2020/TGRSHP2020_TechDoc.pdf + for more information. """ if year is None: year = 2024 print(f"Using the default year of {year}") - + if resolution not in ["500k", "5m", "20m"]: - raise ValueError("Invalid value for resolution. Valid values are '500k', '5m', and '20m'.") - + raise ValueError( + "Invalid value for resolution. Valid values are '500k', '5m', and '20m'." + ) + if cb: if year == 2013: url = f"https://www2.census.gov/geo/tiger/GENZ{year}/cb_{year}_us_csa_{resolution}.zip" @@ -169,56 +186,64 @@ def combined_statistical_areas(cb = False, resolution = "500k", year = None, cac url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_csa_{resolution}.zip" else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/CSA/tl_{year}_us_csa.zip" - - return _load_tiger(url, cache = cache, protocol = protocol, timeout = timeout) + + return _load_tiger(url, cache=cache, protocol=protocol, timeout=timeout) -def metro_divisions(cb = False, resolution = "500k", year = None, cache = False, protocol = "http", timeout = 1800): +def metro_divisions( + cb=False, resolution="500k", year=None, cache=False, protocol="http", timeout=1800 +): """ Load a metropolitan divisions shapefile into Python as a GeoDataFrame Parameters ---------- - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. + cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. 
Defaults to False (the regular TIGER/Line file). - resolution : str - The resolution of the cartographic boundary file; only applies if + resolution : str + The resolution of the cartographic boundary file; only applies if the cb argument is set to True. The default is "500k"; options also - include "5m" (1:5 million) and "20m" (1:20 million) - year : int + include "5m" (1:5 million) and "20m" (1:20 million) + year : int The year of the TIGER/Line or cartographic boundary shapefile. If not specified, defaults to 2024. - cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. protocol : str The protocol to use for downloading the file. Defaults to "http". timeout : int The timeout for the download request in seconds. Defaults to 1800 (30 minutes). Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of metropolitan divisions Notes - ---------- - See https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2020/TGRSHP2020_TechDoc.pdf for more information. + ----- + See https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2020/TGRSHP2020_TechDoc.pdf + for more information. """ if year is None: year = 2024 print(f"Using the default year of {year}") - + if resolution not in ["500k", "5m", "20m"]: - raise ValueError("Invalid value for resolution. Valid values are '500k', '5m', and '20m'.") + raise ValueError( + "Invalid value for resolution. Valid values are '500k', '5m', and '20m'." 
+ ) if year == 2022: - raise ValueError("Metropolitan divisions for 2022 are not yet defined due to the re-organization of counties in Connecticut.") - + raise ValueError( + "Metropolitan divisions for 2022 are not yet defined due to the " + "re-organization of counties in Connecticut." + ) + if cb: if year == 2013: url = f"https://www2.census.gov/geo/tiger/GENZ{year}/cb_{year}_us_metdiv_{resolution}.zip" @@ -226,67 +251,73 @@ def metro_divisions(cb = False, resolution = "500k", year = None, cache = False, url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_metdiv_{resolution}.zip" else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/METDIV/tl_{year}_us_metdiv.zip" - - return _load_tiger(url, cache = cache, protocol = protocol, timeout = timeout) + return _load_tiger(url, cache=cache, protocol=protocol, timeout=timeout) -def new_england(type = "necta", cb = False, year = None, cache = False, protocol = "http", timeout = 1800): + +def new_england( + type="necta", cb=False, year=None, cache=False, protocol="http", timeout=1800 +): """ Load a metropolitan divisions shapefile into Python as a GeoDataFrame Parameters ---------- - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. + cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. Defaults to False (the regular TIGER/Line file). - resolution : str - The resolution of the cartographic boundary file; only applies if + resolution : str + The resolution of the cartographic boundary file; only applies if the cb argument is set to True. The default is "500k"; options also - include "5m" (1:5 million) and "20m" (1:20 million) - year : int + include "5m" (1:5 million) and "20m" (1:20 million) + year : int The year of the TIGER/Line or cartographic boundary shapefile. If not specified, defaults to 2024. 
- cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. protocol : str The protocol to use for downloading the file. Defaults to "http". timeout : int The timeout for the download request in seconds. Defaults to 1800 (30 minutes). Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of metropolitan divisions Notes - ---------- - See https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2020/TGRSHP2020_TechDoc.pdf for more information. + ----- + See https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2020/TGRSHP2020_TechDoc.pdf + for more information. """ if year is None: year = 2024 print(f"Using the default year of {year}") - + if type == "necta": if cb: url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_necta_500k.zip" else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/NECTA/tl_{year}_us_necta.zip" - return _load_tiger(url, cache = cache, protocol = protocol, timeout = timeout) + return _load_tiger(url, cache=cache, protocol=protocol, timeout=timeout) elif type == "combined": url = f"https://www2.census.gov/geo/tiger/TIGER{year}/CNECTA/tl_{year}_us_cnecta.zip" - return _load_tiger(url, cache = cache, protocol = protocol, timeout = timeout) + return _load_tiger(url, cache=cache, protocol=protocol, timeout=timeout) elif type == "divisions": url = f"https://www2.census.gov/geo/tiger/TIGER{year}/NECTADIV/tl_{year}_us_nectadiv.zip" - return _load_tiger(url, cache = cache, protocol = protocol, timeout = timeout) + return _load_tiger(url, cache=cache, protocol=protocol, timeout=timeout) else: - raise ValueError("Invalid NECTA type; valid values include 'necta' (the default), 'combined', and 'divisions'.") + raise 
ValueError( + "Invalid NECTA type; valid values include 'necta' (the default), " + "'combined', and 'divisions'." + ) diff --git a/pygris/national.py b/pygris/national.py index 31afaed..196e8cb 100644 --- a/pygris/national.py +++ b/pygris/national.py @@ -4,130 +4,134 @@ from pygris.helpers import _load_tiger -def regions(resolution = "500k", year = None, cache = False, protocol = "http", timeout = 1800): + +def regions(resolution="500k", year=None, cache=False, protocol="http", timeout=1800): """ Load a US Census regions shapefile into Python as a GeoDataFrame Parameters ---------- resolution : str - The resolution of the cartographic boundary file; only applies if + The resolution of the cartographic boundary file; only applies if the cb argument is set to True. The default is "500k"; options also include "5m" (1:5 million) and "20m" (1:20 million) - - year : int + + year : int The year of the cartographic boundary shapefile. If not specified, defaults to 2024. - cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. protocol : str The protocol to use for downloading the file. Defaults to "http". timeout : int The timeout for the download request in seconds. Defaults to 1800 (30 minutes). Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of US regions. """ - if year is None: year = 2024 print(f"Using the default year of {year}") - + if resolution not in ["500k", "5m", "20m"]: - raise ValueError("Invalid value for resolution. Valid values are '500k', '5m', and '20m'.") + raise ValueError( + "Invalid value for resolution. Valid values are '500k', '5m', and '20m'." 
+ ) url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_region_{resolution}.zip" - rgns = _load_tiger(url, cache = cache, protocol = protocol, timeout = timeout) + rgns = _load_tiger(url, cache=cache, protocol=protocol, timeout=timeout) return rgns -def nation(resolution = "5m", year = None, cache = False, protocol = "http", timeout = 1800): +def nation(resolution="5m", year=None, cache=False, protocol="http", timeout=1800): """ Load a US national boundary shapefile into Python as a GeoDataFrame Parameters ---------- resolution : str - The resolution of the cartographic boundary file; only applies if - the cb argument is set to True. The default is "5m" (1:5 million); + The resolution of the cartographic boundary file; only applies if + the cb argument is set to True. The default is "5m" (1:5 million); "20m" (1:20 million) is also available. - - year : int + + year : int The year of the cartographic boundary shapefile. If not specified, defaults to 2024. - cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. protocol : str The protocol to use for downloading the file. Defaults to "http". timeout : int The timeout for the download request in seconds. Defaults to 1800 (30 minutes). Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of the US boundary. """ - if year is None: year = 2024 print(f"Using the default year of {year}") - + if resolution not in ["5m", "20m"]: - raise ValueError("Invalid value for resolution. Valid values are '500k', '5m', and '20m'.") + raise ValueError( + "Invalid value for resolution. Valid values are '500k', '5m', and '20m'." 
+ ) url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_nation_{resolution}.zip" - nat = _load_tiger(url, cache = cache, protocol = protocol, timeout = timeout) + nat = _load_tiger(url, cache=cache, protocol=protocol, timeout=timeout) return nat -def divisions(resolution = "500k", year = None, cache = False, protocol = "http", timeout = 1800): +def divisions(resolution="500k", year=None, cache=False, protocol="http", timeout=1800): """ Load a US Census divisions shapefile into Python as a GeoDataFrame Parameters ---------- resolution : str - The resolution of the cartographic boundary file; only applies if + The resolution of the cartographic boundary file; only applies if the cb argument is set to True. The default is "500k"; options also include "5m" (1:5 million) and "20m" (1:20 million) - - year : int + + year : int The year of the cartographic boundary shapefile. If not specified, defaults to 2024. - cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. protocol : str The protocol to use for downloading the file. Defaults to "http". timeout : int The timeout for the download request in seconds. Defaults to 1800 (30 minutes). Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of US Census divisions. """ - if year is None: year = 2024 print(f"Using the default year of {year}") - + if resolution not in ["500k", "5m", "20m"]: - raise ValueError("Invalid value for resolution. Valid values are '500k', '5m', and '20m'.") + raise ValueError( + "Invalid value for resolution. Valid values are '500k', '5m', and '20m'." 
+ ) url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_division_{resolution}.zip" - div = _load_tiger(url, cache = cache, protocol = protocol, timeout = timeout) + div = _load_tiger(url, cache=cache, protocol=protocol, timeout=timeout) - return div \ No newline at end of file + return div diff --git a/pygris/native.py b/pygris/native.py index 64a6d43..0f9e496 100644 --- a/pygris/native.py +++ b/pygris/native.py @@ -1,304 +1,334 @@ from pygris.helpers import _load_tiger -def native_areas(cb = False, year = None, cache = False, subset_by = None, protocol = "http", timeout = 1800): + +def native_areas( + cb=False, year=None, cache=False, subset_by=None, protocol="http", timeout=1800 +): """ - Load an American Indian / Alaska Native / Native Hawaiian areas shapefile into Python as a GeoDataFrame + Load an American Indian / Alaska Native / Native Hawaiian areas shapefile into + Python as a GeoDataFrame Parameters ---------- - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. + cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. Defaults to False (the regular TIGER/Line file). - year : int + year : int The year of the TIGER/Line or cartographic boundary shapefile. If not specified, defaults to 2024. - cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. subset_by : tuple, int, slice, dict, geopandas.GeoDataFrame, or geopandas.GeoSeries - An optional directive telling pygris to return a subset of data using - underlying arguments in geopandas.read_file(). + An optional directive telling pygris to return a subset of data using + underlying arguments in geopandas.read_file(). 
subset_by operates as follows: - * If a user supplies a tuple of format (minx, miny, maxx, maxy), + * If a user supplies a tuple of format (minx, miny, maxx, maxy), it will be interpreted as a bounding box and rows will be returned that intersect that bounding box; * If a user supplies a integer or a slice object, the first n rows (or the rows defined by the slice object) will be returned; * If a user supplies an object of type geopandas.GeoDataFrame - or of type geopandas.GeoSeries, rows that intersect the input - object will be returned. CRS misalignment will be resolved - internally. + or of type geopandas.GeoSeries, rows that intersect the input + object will be returned. CRS misalignment will be resolved + internally. * A dict of format {"address": "buffer_distance"} will return rows - that intersect a buffer of a given distance (in meters) around an - input address. + that intersect a buffer of a given distance (in meters) around an + input address. protocol : str The protocol to use for downloading the file. Defaults to "http". timeout : int The timeout for the download request in seconds. Defaults to 1800 (30 minutes). Returns - ---------- - geopandas.GeoDataFrame: A GeoDataFrame of American Indian / Alaska Native / Native Hawaiian areas. + ------- + geopandas.GeoDataFrame + A GeoDataFrame of American Indian / Alaska Native / Native Hawaiian areas. Notes - ---------- - See https://www2.census.gov/geo/pdfs/reference/GARM/Ch5GARM.pdf for more information. + ----- + See https://www2.census.gov/geo/pdfs/reference/GARM/Ch5GARM.pdf for more + information. 
""" if year is None: print("Using the default year of 2024") year = 2024 - + if cb: url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_aiannh_500k.zip" else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/AIANNH/tl_{year}_us_aiannh.zip" - - return _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) + + return _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) -def tribal_subdivisions_national(cb = False, year = None, cache = False, subset_by = None, protocol = "http", timeout = 1800): +def tribal_subdivisions_national( + cb=False, year=None, cache=False, subset_by=None, protocol="http", timeout=1800 +): """ - Load an American Indian Tribal Subdivision National shapefile into Python as a GeoDataFrame + Load an American Indian Tribal Subdivision National shapefile into Python as a + GeoDataFrame Parameters ---------- - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. + cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. Defaults to False (the regular TIGER/Line file). - year : int + year : int The year of the TIGER/Line or cartographic boundary shapefile. If not specified, defaults to 2024. - cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. subset_by : tuple, int, slice, dict, geopandas.GeoDataFrame, or geopandas.GeoSeries - An optional directive telling pygris to return a subset of data using - underlying arguments in geopandas.read_file(). + An optional directive telling pygris to return a subset of data using + underlying arguments in geopandas.read_file(). 
subset_by operates as follows: - * If a user supplies a tuple of format (minx, miny, maxx, maxy), + * If a user supplies a tuple of format (minx, miny, maxx, maxy), it will be interpreted as a bounding box and rows will be returned that intersect that bounding box; * If a user supplies a integer or a slice object, the first n rows (or the rows defined by the slice object) will be returned; * If a user supplies an object of type geopandas.GeoDataFrame - or of type geopandas.GeoSeries, rows that intersect the input - object will be returned. CRS misalignment will be resolved - internally. + or of type geopandas.GeoSeries, rows that intersect the input + object will be returned. CRS misalignment will be resolved + internally. * A dict of format {"address": "buffer_distance"} will return rows - that intersect a buffer of a given distance (in meters) around an - input address. + that intersect a buffer of a given distance (in meters) around an + input address. protocol : str The protocol to use for downloading the file. Defaults to "http". timeout : int The timeout for the download request in seconds. Defaults to 1800 (30 minutes). Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of American Indian Tribal Subdivisions. Notes - ---------- - See https://www2.census.gov/geo/pdfs/reference/GARM/Ch5GARM.pdf for more information. + ----- + See https://www2.census.gov/geo/pdfs/reference/GARM/Ch5GARM.pdf for more + information. 
""" if year is None: print("Using the default year of 2024") year = 2024 - + if cb: url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_aitsn_500k.zip" else: - if year < 2015: + if year < 2015: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/AITS/tl_{year}_us_aitsn.zip" else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/AITSN/tl_{year}_us_aitsn.zip" - return _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) + return _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) -def alaska_native_regional_corporations(cb = False, year = None, cache = False, subset_by = None, protocol = "http", timeout = 1800): +def alaska_native_regional_corporations( + cb=False, year=None, cache=False, subset_by=None, protocol="http", timeout=1800 +): """ Load an Alaska Native Regional Corporation shapefile into Python as a GeoDataFrame Parameters ---------- - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. + cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. Defaults to False (the regular TIGER/Line file). - year : int + year : int The year of the TIGER/Line or cartographic boundary shapefile. If not specified, defaults to 2024. - cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. subset_by : tuple, int, slice, dict, geopandas.GeoDataFrame, or geopandas.GeoSeries - An optional directive telling pygris to return a subset of data using - underlying arguments in geopandas.read_file(). 
+ An optional directive telling pygris to return a subset of data using + underlying arguments in geopandas.read_file(). subset_by operates as follows: - * If a user supplies a tuple of format (minx, miny, maxx, maxy), + * If a user supplies a tuple of format (minx, miny, maxx, maxy), it will be interpreted as a bounding box and rows will be returned that intersect that bounding box; * If a user supplies a integer or a slice object, the first n rows (or the rows defined by the slice object) will be returned; * If a user supplies an object of type geopandas.GeoDataFrame - or of type geopandas.GeoSeries, rows that intersect the input - object will be returned. CRS misalignment will be resolved - internally. + or of type geopandas.GeoSeries, rows that intersect the input + object will be returned. CRS misalignment will be resolved + internally. * A dict of format {"address": "buffer_distance"} will return rows - that intersect a buffer of a given distance (in meters) around an - input address. + that intersect a buffer of a given distance (in meters) around an + input address. protocol : str The protocol to use for downloading the file. Defaults to "http". timeout : int The timeout for the download request in seconds. Defaults to 1800 (30 minutes). Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of Alaska Native Regional Corporations. Notes - ---------- - See https://www2.census.gov/geo/pdfs/reference/GARM/Ch5GARM.pdf for more information. + ----- + See https://www2.census.gov/geo/pdfs/reference/GARM/Ch5GARM.pdf for more + information. 
""" if year is None: print("Using the default year of 2024") year = 2024 - + if cb: url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_02_anrc_500k.zip" else: - url = f"https://www2.census.gov/geo/tiger/TIGER{year}/ANRC/tl_{year}_02_anrc.zip" - - return _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) + url = ( + f"https://www2.census.gov/geo/tiger/TIGER{year}/ANRC/tl_{year}_02_anrc.zip" + ) + + return _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) -def tribal_block_groups(cb = False, year = None, cache = False, subset_by = None, protocol = "http", timeout = 1800): +def tribal_block_groups( + cb=False, year=None, cache=False, subset_by=None, protocol="http", timeout=1800 +): """ Load a Tribal block groups shapefile into Python as a GeoDataFrame Parameters ---------- - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. + cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. Defaults to False (the regular TIGER/Line file). - year : int + year : int The year of the TIGER/Line or cartographic boundary shapefile. If not specified, defaults to 2024. - cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. subset_by : tuple, int, slice, dict, geopandas.GeoDataFrame, or geopandas.GeoSeries - An optional directive telling pygris to return a subset of data using - underlying arguments in geopandas.read_file(). + An optional directive telling pygris to return a subset of data using + underlying arguments in geopandas.read_file(). 
subset_by operates as follows: - * If a user supplies a tuple of format (minx, miny, maxx, maxy), + * If a user supplies a tuple of format (minx, miny, maxx, maxy), it will be interpreted as a bounding box and rows will be returned that intersect that bounding box; * If a user supplies a integer or a slice object, the first n rows (or the rows defined by the slice object) will be returned; * If a user supplies an object of type geopandas.GeoDataFrame - or of type geopandas.GeoSeries, rows that intersect the input - object will be returned. CRS misalignment will be resolved - internally. + or of type geopandas.GeoSeries, rows that intersect the input + object will be returned. CRS misalignment will be resolved + internally. * A dict of format {"address": "buffer_distance"} will return rows - that intersect a buffer of a given distance (in meters) around an - input address. + that intersect a buffer of a given distance (in meters) around an + input address. protocol : str The protocol to use for downloading the file. Defaults to "http". timeout : int The timeout for the download request in seconds. Defaults to 1800 (30 minutes). Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of Tribal block groups. Notes - ---------- - See https://www2.census.gov/geo/pdfs/reference/GARM/Ch5GARM.pdf for more information. + ----- + See https://www2.census.gov/geo/pdfs/reference/GARM/Ch5GARM.pdf for more + information. 
""" if year is None: print("Using the default year of 2024") year = 2024 - + if cb: url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_tbg_500k.zip" else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/TBG/tl_{year}_us_tbg.zip" - - return _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) + return _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) -def tribal_tracts(cb = False, year = None, cache = False, subset_by = None, protocol = "http", timeout = 1800): +def tribal_tracts( + cb=False, year=None, cache=False, subset_by=None, protocol="http", timeout=1800 +): """ Load a Tribal Census tracts shapefile into Python as a GeoDataFrame Parameters ---------- - cb : bool - If set to True, download a generalized (1:500k) cartographic boundary file. + cb : bool + If set to True, download a generalized (1:500k) cartographic boundary file. Defaults to False (the regular TIGER/Line file). - year : int + year : int The year of the TIGER/Line or cartographic boundary shapefile. If not specified, defaults to 2024. - cache : bool - If True, the function will download a Census shapefile to a cache directory + cache : bool + If True, the function will download a Census shapefile to a cache directory on the user's computer for future access. If False, the function will load - the shapefile directly from the Census website. + the shapefile directly from the Census website. subset_by : tuple, int, slice, dict, geopandas.GeoDataFrame, or geopandas.GeoSeries - An optional directive telling pygris to return a subset of data using - underlying arguments in geopandas.read_file(). + An optional directive telling pygris to return a subset of data using + underlying arguments in geopandas.read_file(). 
subset_by operates as follows: - * If a user supplies a tuple of format (minx, miny, maxx, maxy), + * If a user supplies a tuple of format (minx, miny, maxx, maxy), it will be interpreted as a bounding box and rows will be returned that intersect that bounding box; * If a user supplies a integer or a slice object, the first n rows (or the rows defined by the slice object) will be returned; * If a user supplies an object of type geopandas.GeoDataFrame - or of type geopandas.GeoSeries, rows that intersect the input - object will be returned. CRS misalignment will be resolved - internally. + or of type geopandas.GeoSeries, rows that intersect the input + object will be returned. CRS misalignment will be resolved + internally. * A dict of format {"address": "buffer_distance"} will return rows - that intersect a buffer of a given distance (in meters) around an - input address. + that intersect a buffer of a given distance (in meters) around an + input address. protocol : str The protocol to use for downloading the file. Defaults to "http". timeout : int The timeout for the download request in seconds. Defaults to 1800 (30 minutes). Returns - ---------- + ------- geopandas.GeoDataFrame: A GeoDataFrame of Tribal Census tracts. Notes - ---------- - See https://www2.census.gov/geo/pdfs/reference/GARM/Ch5GARM.pdf for more information. + ----- + See https://www2.census.gov/geo/pdfs/reference/GARM/Ch5GARM.pdf for more + information. 
""" if year is None: print("Using the default year of 2024") year = 2024 - + if cb: url = f"https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_ttract_500k.zip" else: url = f"https://www2.census.gov/geo/tiger/TIGER{year}/TTRACT/tl_{year}_us_ttract.zip" - - return _load_tiger(url, cache = cache, subset_by = subset_by, protocol = protocol, timeout = timeout) \ No newline at end of file + + return _load_tiger( + url, cache=cache, subset_by=subset_by, protocol=protocol, timeout=timeout + ) diff --git a/pygris/transportation.py b/pygris/transportation.py index 6f9ac82..c49c4de 100644 --- a/pygris/transportation.py +++ b/pygris/transportation.py @@ -2,332 +2,359 @@ __author__ = "Kyle Walker = @area_threshold') + water_thresh = all_water.query("water_rank >= @area_threshold") # Erase the water area - erased = input.overlay(water_thresh, how = "difference") + erased = input.overlay(water_thresh, how="difference") return erased -def shift_geometry(input, geoid_column = None, preserve_area = False, position = "below"): +def shift_geometry(input, geoid_column=None, preserve_area=False, position="below"): """ Shift and optionally rescale Alaska, Hawaii, and Puerto Rico for better cartographic display Parameters - --------------- + ---------- input : geopandas.GeoDataFrame - A dataset of features in the United States to shift / rescale. + A dataset of features in the United States to shift / rescale. geoid_column : str, optional - An optional column in the dataset that provides a state FIPS code. If used, avoids spatial - overlay to identify features and can speed processing. + An optional column in the dataset that provides a state FIPS code. If used, avoids spatial + overlay to identify features and can speed processing. preserve_area : bool - Whether or not to preserve the area of Alaska, Hawaii, and Puerto Rico when re-arranging - features. 
If False, Alaska will be shrunk to about half its size; Hawaii will be - rescaled to 1.5x its size, and Puerto Rico will be rescaled to 2.5x its size. - If True, sizes of Alaska, Hawaii, and Puerto Rico relative to the continental United - States will be preserved. Defaults to False. + Whether or not to preserve the area of Alaska, Hawaii, and Puerto Rico when re-arranging + features. If False, Alaska will be shrunk to about half its size; Hawaii will be + rescaled to 1.5x its size, and Puerto Rico will be rescaled to 2.5x its size. + If True, sizes of Alaska, Hawaii, and Puerto Rico relative to the continental United + States will be preserved. Defaults to False. position : str - One of "below" (the default), which moves features in Alaska, Hawaii, and Puerto Rico below the - continental United States; or "outside", which places features outside the - continental US in locations that correspond roughly to their actual geographic - positions. + One of "below" (the default), which moves features in Alaska, Hawaii, and Puerto Rico below the + continental United States; or "outside", which places features outside the + continental US in locations that correspond roughly to their actual geographic + positions. Returns - ----------- + ------- The original input dataset with shifted / rescaled geometry. Notes - ----------- + ----- `shift_geometry()`, while designed for use with objects from the pygris package, will work with any US dataset. If aligning datasets from multiple sources, you must take care to ensure that your options specified in `preserve_area` and `position` are identical across layers. Otherwise your layers @@ -136,56 +143,64 @@ def shift_geometry(input, geoid_column = None, preserve_area = False, position = Data Visualization (https://clauswilke.com/dataviz/geospatial-data.html); Bob Rudis's albersusa R package (https://github.com/hrbrmstr/albersusa); and the ggcart R package (https://uncoast-unconf.github.io/ggcart/). 
- - """ - minimal_states = states(cb = True, resolution = "20m", year = 2021).to_crs('ESRI:102003') + """ # noqa: E501 + minimal_states = states(cb=True, resolution="20m", year=2021).to_crs("ESRI:102003") - ak_bbox = gp.GeoDataFrame(geometry = minimal_states.query("GEOID == '02'").envelope) - hi_bbox = gp.GeoDataFrame(geometry = minimal_states.query("GEOID == '15'").envelope) - pr_bbox = gp.GeoDataFrame(geometry = minimal_states.query("GEOID == '72'").envelope) + ak_bbox = gp.GeoDataFrame(geometry=minimal_states.query("GEOID == '02'").envelope) + hi_bbox = gp.GeoDataFrame(geometry=minimal_states.query("GEOID == '15'").envelope) + pr_bbox = gp.GeoDataFrame(geometry=minimal_states.query("GEOID == '72'").envelope) boxes = pd.concat([ak_bbox, hi_bbox, pr_bbox]) - boxes['state_fips'] = ['02', '15', '72'] + boxes["state_fips"] = ["02", "15", "72"] input_albers = input.to_crs(minimal_states.crs) if geoid_column is not None: - input_albers['state_fips'] = input_albers[geoid_column].str.slice(0, 2) + input_albers["state_fips"] = input_albers[geoid_column].str.slice(0, 2) else: - input_albers = input_albers.sjoin(boxes, how = "left") + input_albers = input_albers.sjoin(boxes, how="left") - input_albers['state_fips'] = input_albers['state_fips'].fillna("00") - - # Alaska/Hawaii/PR centroids are necessary to put any dataset in the correct location + input_albers["state_fips"] = input_albers["state_fips"].fillna("00") + + # Alaska/Hawaii/PR centroids are necessary to put any dataset in the correct + # location ak_crs = 3338 - hi_crs = 'ESRI:102007' + hi_crs = "ESRI:102007" pr_crs = 32161 ak_centroid = minimal_states.query("GEOID == '02'").to_crs(ak_crs).centroid hi_centroid = minimal_states.query("GEOID == '15'").to_crs(hi_crs).centroid pr_centroid = minimal_states.query("GEOID == '72'").to_crs(pr_crs).centroid - def place_geometry_wilke(geometry, position, centroid, scale = 1): - centroid_x = centroid.x.values[0] - centroid_y = centroid.y.values[0] - diff = 
geometry.translate(xoff = -centroid_x, yoff = -centroid_y) - scaled = diff.scale(xfact = scale, yfact = scale, origin = (centroid_x, centroid_y)) - return scaled.translate(xoff = position[0], yoff = position[1]) + def place_geometry_wilke(geometry, position, centroid, scale=1): + centroid_x = centroid.x.to_numpy()[0] + centroid_y = centroid.y.to_numpy()[0] + diff = geometry.translate(xoff=-centroid_x, yoff=-centroid_y) + scaled = diff.scale(xfact=scale, yfact=scale, origin=(centroid_x, centroid_y)) + return scaled.translate(xoff=position[0], yoff=position[1]) - bb = minimal_states.query('GEOID not in ["02", "15", "72"]', engine = "python").total_bounds + bb = minimal_states.query( + 'GEOID not in ["02", "15", "72"]', engine="python" + ).total_bounds - us_lower48 = input_albers.query('state_fips not in ["02", "15", "72"]', engine = "python") + us_lower48 = input_albers.query( + 'state_fips not in ["02", "15", "72"]', engine="python" + ) us_alaska = input_albers.query('state_fips == "02"') us_hawaii = input_albers.query('state_fips == "15"') us_puerto_rico = input_albers.query('state_fips == "72"') if pd.concat([us_alaska, us_hawaii, us_puerto_rico]).shape[0] == 0: - UserWarning("None of your features are in Alaska, Hawaii, or Puerto Rico, so no geometries will be shifted.\nTransforming your object's CRS to 'ESRI:102003'") - return input_albers.drop(['state_fips', 'index_right'], axis = 1) - + UserWarning( + "None of your features are in Alaska, Hawaii, or Puerto Rico, so no " + "geometries will be shifted.\nTransforming your object's " + "CRS to 'ESRI:102003'" + ) + return input_albers.drop(["state_fips", "index_right"], axis=1) + shapes_list = [us_lower48] if not preserve_area: @@ -194,40 +209,55 @@ def place_geometry_wilke(geometry, position, centroid, scale = 1): if position == "below": ak_rescaled.geometry = place_geometry_wilke( - geometry = ak_rescaled.geometry, - position = [bb[0] + 0.06 * (bb[2] - bb[0]), bb[1] - 0.14 * (bb[3] - bb[1])], - scale = 0.5, - 
centroid = ak_centroid) + geometry=ak_rescaled.geometry, + position=[ + bb[0] + 0.06 * (bb[2] - bb[0]), + bb[1] - 0.14 * (bb[3] - bb[1]), + ], + scale=0.5, + centroid=ak_centroid, + ) elif position == "outside": ak_rescaled.geometry = place_geometry_wilke( - geometry = ak_rescaled.geometry, - position = [bb[0] - 0.08 * (bb[2] - bb[0]), bb[1] + 0.92 * (bb[3] - bb[1])], - scale = 0.5, - centroid = ak_centroid) - - ak_rescaled.set_crs('ESRI:102003', inplace = True, allow_override = True) + geometry=ak_rescaled.geometry, + position=[ + bb[0] - 0.08 * (bb[2] - bb[0]), + bb[1] + 0.92 * (bb[3] - bb[1]), + ], + scale=0.5, + centroid=ak_centroid, + ) + + ak_rescaled.set_crs("ESRI:102003", inplace=True, allow_override=True) shapes_list.append(ak_rescaled) - + if us_hawaii.shape[0] > 0: - hi_rescaled = us_hawaii.overlay(hi_bbox).to_crs(hi_crs) if position == "below": hi_rescaled.geometry = place_geometry_wilke( - geometry = hi_rescaled.geometry, - position = [bb[0] + 0.32 * (bb[2] - bb[0]), bb[1] + 0.2 * (bb[3] - bb[1])], - scale = 1.5, - centroid = hi_centroid) + geometry=hi_rescaled.geometry, + position=[ + bb[0] + 0.32 * (bb[2] - bb[0]), + bb[1] + 0.2 * (bb[3] - bb[1]), + ], + scale=1.5, + centroid=hi_centroid, + ) elif position == "outside": hi_rescaled.geometry = place_geometry_wilke( - geometry = hi_rescaled.geometry, - position = [bb[0] + 0.05 * (bb[2] - bb[0]), bb[1] + 0.35 * (bb[3] - bb[1])], - scale = 1.5, - centroid = hi_centroid) - - hi_rescaled.set_crs('ESRI:102003', inplace = True, allow_override = True) + geometry=hi_rescaled.geometry, + position=[ + bb[0] + 0.05 * (bb[2] - bb[0]), + bb[1] + 0.35 * (bb[3] - bb[1]), + ], + scale=1.5, + centroid=hi_centroid, + ) + + hi_rescaled.set_crs("ESRI:102003", inplace=True, allow_override=True) shapes_list.append(hi_rescaled) @@ -236,22 +266,30 @@ def place_geometry_wilke(geometry, position, centroid, scale = 1): if position == "below": pr_rescaled.geometry = place_geometry_wilke( - geometry = pr_rescaled.geometry, - 
position = [bb[0] + 0.75 * (bb[2] - bb[0]), bb[1] + 0.15 * (bb[3] - bb[1])], - scale = 2.5, - centroid = pr_centroid) + geometry=pr_rescaled.geometry, + position=[ + bb[0] + 0.75 * (bb[2] - bb[0]), + bb[1] + 0.15 * (bb[3] - bb[1]), + ], + scale=2.5, + centroid=pr_centroid, + ) elif position == "outside": pr_rescaled.geometry = place_geometry_wilke( - geometry = pr_rescaled.geometry, - position = [bb[0] + 1.0 * (bb[2] - bb[0]), bb[1] + 0.05 * (bb[3] - bb[1])], - scale = 2.5, - centroid = pr_centroid) - - pr_rescaled.set_crs('ESRI:102003', inplace = True, allow_override = True) + geometry=pr_rescaled.geometry, + position=[ + bb[0] + 1.0 * (bb[2] - bb[0]), + bb[1] + 0.05 * (bb[3] - bb[1]), + ], + scale=2.5, + centroid=pr_centroid, + ) + + pr_rescaled.set_crs("ESRI:102003", inplace=True, allow_override=True) shapes_list.append(pr_rescaled) - - output_data = pd.concat(shapes_list).drop(['state_fips', 'index_right'], axis = 1) + + output_data = pd.concat(shapes_list).drop(["state_fips", "index_right"], axis=1) return output_data @@ -261,39 +299,55 @@ def place_geometry_wilke(geometry, position, centroid, scale = 1): if position == "below": ak_rescaled.geometry = place_geometry_wilke( - geometry = ak_rescaled.geometry, - position = [bb[0] + 0.2 * (bb[2] - bb[0]), bb[1] - 0.13 * (bb[3] - bb[1])], - scale = 1, - centroid = ak_centroid) + geometry=ak_rescaled.geometry, + position=[ + bb[0] + 0.2 * (bb[2] - bb[0]), + bb[1] - 0.13 * (bb[3] - bb[1]), + ], + scale=1, + centroid=ak_centroid, + ) elif position == "outside": ak_rescaled.geometry = place_geometry_wilke( - geometry = ak_rescaled.geometry, - position = [bb[0] - 0.25 * (bb[2] - bb[0]), bb[1] + 1.35 * (bb[3] - bb[1])], - scale = 1, - centroid = ak_centroid) - - ak_rescaled.set_crs('ESRI:102003', inplace = True, allow_override = True) + geometry=ak_rescaled.geometry, + position=[ + bb[0] - 0.25 * (bb[2] - bb[0]), + bb[1] + 1.35 * (bb[3] - bb[1]), + ], + scale=1, + centroid=ak_centroid, + ) + + 
ak_rescaled.set_crs("ESRI:102003", inplace=True, allow_override=True) shapes_list.append(ak_rescaled) - + if us_hawaii.shape[0] > 0: hi_rescaled = us_hawaii.overlay(hi_bbox).to_crs(hi_crs) if position == "below": hi_rescaled.geometry = place_geometry_wilke( - geometry = hi_rescaled.geometry, - position = [bb[0] + 0.6 * (bb[2] - bb[0]), bb[1] - 0.1 * (bb[3] - bb[1])], - scale = 1, - centroid = hi_centroid) + geometry=hi_rescaled.geometry, + position=[ + bb[0] + 0.6 * (bb[2] - bb[0]), + bb[1] - 0.1 * (bb[3] - bb[1]), + ], + scale=1, + centroid=hi_centroid, + ) elif position == "outside": hi_rescaled.geometry = place_geometry_wilke( - geometry = hi_rescaled.geometry, - position = [bb[0] - 0.0 * (bb[2] - bb[0]), bb[1] + 0.2 * (bb[3] - bb[1])], - scale = 1, - centroid = hi_centroid) - - hi_rescaled.set_crs('ESRI:102003', inplace = True, allow_override = True) + geometry=hi_rescaled.geometry, + position=[ + bb[0] - 0.0 * (bb[2] - bb[0]), + bb[1] + 0.2 * (bb[3] - bb[1]), + ], + scale=1, + centroid=hi_centroid, + ) + + hi_rescaled.set_crs("ESRI:102003", inplace=True, allow_override=True) shapes_list.append(hi_rescaled) @@ -302,21 +356,29 @@ def place_geometry_wilke(geometry, position, centroid, scale = 1): if position == "below": pr_rescaled.geometry = place_geometry_wilke( - geometry = pr_rescaled.geometry, - position = [bb[0] + 0.75 * (bb[2] - bb[0]), bb[1] - 0.1 * (bb[3] - bb[1])], - scale = 1, - centroid = pr_centroid) + geometry=pr_rescaled.geometry, + position=[ + bb[0] + 0.75 * (bb[2] - bb[0]), + bb[1] - 0.1 * (bb[3] - bb[1]), + ], + scale=1, + centroid=pr_centroid, + ) elif position == "outside": pr_rescaled.geometry = place_geometry_wilke( - geometry = pr_rescaled.geometry, - position = [bb[0] + 0.95 * (bb[2] - bb[0]), bb[1] - 0.05 * (bb[3] - bb[1])], - scale = 1, - centroid = pr_centroid) - - pr_rescaled.set_crs('ESRI:102003', inplace = True, allow_override = True) + geometry=pr_rescaled.geometry, + position=[ + bb[0] + 0.95 * (bb[2] - bb[0]), + bb[1] - 0.05 * 
(bb[3] - bb[1]), + ], + scale=1, + centroid=pr_centroid, + ) + + pr_rescaled.set_crs("ESRI:102003", inplace=True, allow_override=True) shapes_list.append(pr_rescaled) - - output_data = pd.concat(shapes_list).drop(['state_fips', 'index_right'], axis = 1) + + output_data = pd.concat(shapes_list).drop(["state_fips", "index_right"], axis=1) return output_data diff --git a/pygris/water.py b/pygris/water.py index 7032b6b..c197101 100644 --- a/pygris/water.py +++ b/pygris/water.py @@ -2,56 +2,62 @@ __author__ = "Kyle Walker =0.9 requests -appdirs \ No newline at end of file +appdirs