AlabamaWaterInstitute · jameshalgren · Jan 25, 2023 · Jan 25, 2023 · Jan 26, 2023 · Jan 26, 2023
diff --git a/kerchunk/test_time_kerchunk_singleForecast_nativeNWM.ipynb b/kerchunk/test_time_kerchunk_singleForecast_nativeNWM.ipynb
@@ -0,0 +1,298 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "82a662bf-4aad-437d-984b-cb54ed8abbcc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import shutil\n",
+    "import fsspec\n",
+    "import ujson\n",
+    "from kerchunk.hdf import SingleHdf5ToZarr\n",
+    "from kerchunk.combine import MultiZarrToZarr\n",
+    "import xarray as xr\n",
+    "import dask\n",
+    "import hvplot.xarray\n",
+    "from datetime import datetime, timedelta"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55bdc833-39d8-41e3-a956-54c0d60c55b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import subprocess\n",
+    "\n",
+    "try:\n",
+    "    import google.colab\n",
+    "\n",
+    "    ENV_IS_CL = True\n",
+    "    subprocess.run(\n",
+    "        [\n",
+    "            \"git\",\n",
+    "            \"clone\",\n",
+    "            \"https://github.com/AlabamaWaterInstitute/data_access_examples\",\n",
+    "        ]\n",
+    "    )\n",
+    "    sys.path.append(\"/content/data_access_examples\")\n",
+    "except:\n",
+    "    ENV_IS_CL = False\n",
+    "    sys.path.append(r\"..\")\n",
+    "    sys.path.append(r\"../data_access_examples\")\n",
+    "    sys.path.append(r\"git\")\n",
+    "\n",
+    "print(sys.path[0])\n",
+    "import nwm_filenames.listofnwmfilenames as lnf\n",
+    "from nwm_network.NWM_2_1_outlets import outlets_sorted"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb423afb-67e8-4fe3-adc8-888f15303edb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def gen_json(files):\n",
+    "    open_files = fsspec.open_files(files)\n",
+    "    out = []\n",
+    "    for file in open_files:\n",
+    "        with file as f:\n",
+    "            out.append(SingleHdf5ToZarr(f, file.path).translate())\n",
+    "\n",
+    "    mzz = MultiZarrToZarr(\n",
+    "        out,\n",
+    "        remote_protocol=\"gcs\",\n",
+    "        concat_dims=[\"time\", \"reference_time\"],\n",
+    "    )\n",
+    "\n",
+    "    tot = mzz.translate()\n",
+    "    return tot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "70027348-78a1-497f-9089-7b8948d63d72",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nwm_filenames.listofnwmfilenames as lnf\n",
+    "\n",
+    "configs = [\n",
+    "    (1, 1, 1, -1),  # Short_range\n",
+    "    (2, 1, 1, 1),  # Medium range mem_1\n",
+    "    (2, 1, 1, 2),  # Medium range mem_2\n",
+    "    (2, 1, 1, 3),  # Medium range mem_3\n",
+    "    (2, 1, 1, 4),  # Medium range mem_4\n",
+    "    (2, 1, 1, 5),  # Medium range mem_5\n",
+    "    (2, 1, 1, 6),  # Medium range mem_6\n",
+    "    (2, 1, 1, 7),  # Medium range mem_7\n",
+    "    (3, 1, 1, -1),  # Medium range no_da\n",
+    "]\n",
+    "\n",
+    "prefix = \"\"\n",
+    "start_date = \"20221201\"\n",
+    "end_date = \"20221201\"\n",
+    "configuration_list = []\n",
+    "configuration_list.extend([(*_c, start_date, end_date, [0], 5) for _c in configs])\n",
+    "print(configuration_list)\n",
+    "\n",
+    "file_collections = [lnf.create_file_list(*_c) for _c in configuration_list]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "23dc327e-8bf1-4c0d-8d86-3da9bedd6e09",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "file_collections[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6f0a7a64-cdaf-44ca-ab1c-8ba3174d72c8",
+   "metadata": {},
+   "source": [
+    "### Generate plot data for a single stream segment (the Mississippi River outlet by default)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "306f25c4-0efe-4084-9bbc-7099df31d8b9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from time import time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6c8c919d-8a49-42c5-9393-19cf8130b989",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# id_list = 101\n",
+    "id_list = [22811611]  # Mississippi River outlet\n",
+    "# id_list = [22811611, 20427622]  # Mississippi River outlet plus a small interior outlet in Arizona\n",
+    "# id_list = 20427622  # Random small interior outlet somewhere in Arizona; see https://github.com/AlabamaWaterInstitute/data_access_examples/blob/main/nwm_network/route_link_fsspec.ipynb\n",
+    "# id_list = outlets_sorted\n",
+    "ds_list = []\n",
+    "df_list = []\n",
+    "tot_list = []\n",
+    "for _i, files in enumerate(file_collections[0:8]):\n",
+    "    st = time()\n",
+    "    print(f\"generating jsons for {_i}\", end=\"\\t\")\n",
+    "    tot_list.append(gen_json(files))\n",
+    "    print(f\"{time()-st} elapsed\")\n",
+    "\n",
+    "    print(f\"creating xarray dataset for {_i}\", end=\"\\t\")\n",
+    "    backend_args_1 = {\n",
+    "        \"consolidated\": False,\n",
+    "        \"storage_options\": {\n",
+    "            \"fo\": tot_list[_i],\n",
+    "            \"remote_protocol\": \"gcs\",\n",
+    "            \"remote_options\": {\"anon\": True},\n",
+    "        },\n",
+    "    }\n",
+    "    ds_1 = xr.open_dataset(\"reference://\", engine=\"zarr\", backend_kwargs=backend_args_1)\n",
+    "    print(f\"{time()-st} elapsed\")\n",
+    "\n",
+    "    print(f\"slicing dataset to feature for {_i}\", end=\"\\t\")\n",
+    "    ds_select_1 = ds_1.sel(feature_id=id_list)\n",
+    "    ds_list.append(ds_select_1)\n",
+    "    print(f\"{time()-st} elapsed\")\n",
+    "\n",
+    "    print(f\"querying/retrieving data and creating dataframe for {_i}\", end=\"\\t\")\n",
+    "    df_select_1 = ds_select_1[\"streamflow\"].to_dataframe()\n",
+    "    df_list.append(df_select_1)\n",
+    "    print(f\"{time()-st} elapsed\")\n",
+    "\n",
+    "    # print(f\"selecting feature for {_i}\", end=\"\\t\")\n",
+    "    # ds_select_1.plot.scatter(\"time\",\"streamflow\")\n",
+    "    # print(f\"{time()-st} elapsed\")\n",
+    "\n",
+    "    print(f\"finishing {_i}\", end=\"\\t\")\n",
+    "    print(f\"{time()-st} total time elapsed\")\n",
+    "    print(f\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02c03af6-042c-4de6-84af-645af2dc3f2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# id_list = 101\n",
+    "id_list = [22811611]  # Mississippi River outlet\n",
+    "# id_list = [22811611, 20427622]  # Mississippi River outlet plus a small interior outlet in Arizona\n",
+    "# id_list = 20427622  # Random small interior outlet somewhere in Arizona; see https://github.com/AlabamaWaterInstitute/data_access_examples/blob/main/nwm_network/route_link_fsspec.ipynb\n",
+    "ds_list = []\n",
+    "df_list = []\n",
+    "tot_list = []\n",
+    "for _i, files in enumerate(file_collections[0:8]):\n",
+    "    st = time()\n",
+    "    print(f\"generating jsons for {_i}\", end=\"\\t\")\n",
+    "    tot_list.append(gen_json(files))\n",
+    "    print(f\"{time()-st} elapsed\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e05dbc5e-59e7-4801-aef5-4481aee994fa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "id_list = outlets_sorted\n",
+    "id_list = [22811611]  # Mississippi River outlet\n",
+    "\n",
+    "for _i, files in enumerate(file_collections[0:8]):\n",
+    "    st = time()\n",
+    "    print(f\"creating xarray dataset for {_i}\", end=\"\\t\")\n",
+    "    backend_args_1 = {\n",
+    "        \"consolidated\": False,\n",
+    "        \"storage_options\": {\n",
+    "            \"fo\": tot_list[_i],\n",
+    "            \"remote_protocol\": \"gcs\",\n",
+    "            \"remote_options\": {\"anon\": True},\n",
+    "        },\n",
+    "    }\n",
+    "    ds_1 = xr.open_dataset(\"reference://\", engine=\"zarr\", backend_kwargs=backend_args_1)\n",
+    "    print(f\"{time()-st} elapsed\")\n",
+    "\n",
+    "    print(f\"slicing dataset to feature for {_i}\", end=\"\\t\")\n",
+    "    ds_select_1 = ds_1.sel(feature_id=id_list)\n",
+    "    ds_list.append(ds_select_1)\n",
+    "    print(f\"{time()-st} elapsed\")\n",
+    "\n",
+    "    print(f\"querying/retrieving data and creating dataframe for {_i}\", end=\"\\t\")\n",
+    "    df_select_1 = ds_select_1[\"streamflow\"].to_dataframe()\n",
+    "    df_list.append(df_select_1)\n",
+    "    print(f\"{time()-st} elapsed\")\n",
+    "\n",
+    "    # print(f\"selecting feature for {_i}\", end=\"\\t\")\n",
+    "    # ds_select_1.plot.scatter(\"time\",\"streamflow\")\n",
+    "    # print(f\"{time()-st} elapsed\")\n",
+    "\n",
+    "    print(f\"finishing {_i}\", end=\"\\t\")\n",
+    "    print(f\"{time()-st} total time elapsed\")\n",
+    "    print(f\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "40d432ca-97eb-4509-9ede-d4cdb69c3e3a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_list[1].xs(22811611, axis=0, level=2, drop_level=False)\n",
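+    "# The line above keeps only the rows whose level-2 index value is 22811611; for more on selecting rows in a MultiIndex DataFrame, see https://stackoverflow.com/questions/53927460/select-rows-in-pandas-multiindex-dataframe"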
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e149afcf-1530-4489-9d6f-5b2361202538",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}