From 2478d5abf041e88601a6846c5e8bd3a3d89bce26 Mon Sep 17 00:00:00 2001 From: Kelvin Nguyen Date: Tue, 30 Sep 2025 14:31:31 -0700 Subject: [PATCH 1/4] Updated scripts to use Service principal instead of PAT --- .gitignore | 4 +++- load_data.py | 15 ++++++++++++--- requirements.txt | Bin 2312 -> 1236 bytes 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 019a2a4..009139d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,6 @@ __pycache__/ .env venv_1/ -venv_2/ \ No newline at end of file +venv_2/ +.vscode/ +.venv/ \ No newline at end of file diff --git a/load_data.py b/load_data.py index 2dfd959..5128b3e 100644 --- a/load_data.py +++ b/load_data.py @@ -6,6 +6,7 @@ import yaml from pathlib import Path from databricks import sql +from databricks.sdk.core import Config, oauth_service_principal from dotenv import load_dotenv load_dotenv() @@ -95,15 +96,23 @@ def read_metadata(scenario_path): scenario_str = ','.join(map(str, scenario_id_list)) catalog = os.getenv("DBRICKS_CATALOG", "tam") + server_hostname = os.getenv("DATABRICKS_SERVER_HOSTNAME") + + def credential_provider(): + config = Config( + host = f"https://{server_hostname}", + client_id = os.getenv("DATABRICKS_CLIENT_ID"), + client_secret = os.getenv("DATABRICKS_CLIENT_SECRET")) + return oauth_service_principal(config) + def query_to_df(cursor, query): cursor.execute(query) return cursor.fetchall_arrow().to_pandas() with sql.connect( - server_hostname=os.getenv("DATABRICKS_SERVER_HOSTNAME"), + server_hostname=server_hostname, http_path=os.getenv("DATABRICKS_HTTP_PATH"), - access_token=os.getenv("DATABRICKS_TOKEN"), - ) as connection: + credential_provider=credential_provider) as connection: with connection.cursor() as cursor: df1 = query_to_df(cursor, f"SELECT * FROM {catalog}.validation.fwy WHERE scenario_id IN ({scenario_str})") df2 = query_to_df(cursor, f"SELECT * FROM {catalog}.validation.all_class WHERE scenario_id IN ({scenario_str})") diff --git a/requirements.txt b/requirements.txt index 3303febd4ae88ea09203ae72902571d59f0794e2..c4cea9eeee97ae0e0eac36a13c10f6a902ee90df 100644 GIT binary patch literal 1236 zcmYjR+j8q541DKb%*2AS z<3`rJxn9p#LRq68)AWhVRw!?Mq>Z@vAd$vrr87TnQF*3x=WpDW@8;%xPB~I&l;47P z+GVtXCS+e9=vUS*8Sl>UnP)XEmg}~tdp=D+E8~JzhuloJh|W@zX0&&aBWv+D{Lcc@ zIjyn>nc;-Yq%5V>Jk6Wp%!&P#6Fu7a;p3P55_&dBLiS<`WXrXo&TF%avAPnthy|5^ z&2;;EXjpyYkkm;iB3A8h`||r9r$7!GF)h##tM{wb4ec4Kb#h<{*Qm8C)O$$YiFJ2I z`RMR~fiA+#4*H`yJ|St$7u?Qrur)Kl>ZtD)Q$-@D)vQgY&+e)->24JCB=G~)JMEx3>6oyx! zfd0*Gf0WYs?p=d1Xy^(y9WSH|K4r9d)XURnZ0izy;~8%u7e8o27UaSy>VG?QEW_)W zcGNi@MH4Ip<`WZ!JpiEp@C)9ddz`@6jyS zG@$bI@ZSQI=I>q({6~U^^~SgquvA!C&cU6??`pl1$Q~_*+cP${PCRvdy!DQqU>l9% KZ9?GhGQIyGM0pGV literal 2312 zcmZve&2AG>42AC+iFe=y5-~|z=z;~RV1YnFVu4silQd1!CNpU!X`4Pga6bDwcUsWs zcA~L;{CDizKfmkL=xNg~)jIQZnFe|v(ni0#^g++tG)(Vw-lo^-Rhp(%TFT;9@~z1$ z$=@@=#ugzF^GLqjb>hlr2`?kI;YVb0ytc73ICN z=!lu~61hwFX<-_~&Z$)M@D3BEih8MtZ{>5Kb0<{K0Jh8}*(oCLKeYp+cmaVfP$%pk ztXp*Q^5}JJ-HwXFKQg^P%3;*G#q>#5%k1yM>Hup>{8f*gWIj>d*7A-PwV+Q?Gxzs7 z$L0)iUPk;yHb@ti$F*>p><=7g1Jzmxy;ZID^7XG5mu`g7$7FTCPDYsY(kIqC4V9YpSF!qnVpq9;2*nD zZ&z)s!^~=7DcOym=^W&LYnl&+QfU|VHy-h`m)Kcs1w0W)Whjx!bDfFw*brs@#x^ Date: Wed, 1 Oct 2025 07:28:36 -0700 Subject: [PATCH 2/4] Fixed credentials typo --- load_data.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/load_data.py b/load_data.py index 5128b3e..3c6aab8 100644 --- a/load_data.py +++ b/load_data.py @@ -109,10 +109,9 @@ def query_to_df(cursor, query): cursor.execute(query) return cursor.fetchall_arrow().to_pandas() - with sql.connect( - server_hostname=server_hostname, - http_path=os.getenv("DATABRICKS_HTTP_PATH"), - credential_provider=credential_provider) as connection: + with sql.connect(server_hostname=server_hostname, + http_path=os.getenv("DATABRICKS_HTTP_PATH"), + credentials_provider=credential_provider) as connection: with connection.cursor() as cursor: df1 = query_to_df(cursor, f"SELECT * FROM {catalog}.validation.fwy WHERE scenario_id IN ({scenario_str})") df2 = query_to_df(cursor, f"SELECT * FROM {catalog}.validation.all_class WHERE scenario_id IN ({scenario_str})") From 74f764ce56307923c549ead5c2eeafacc81b3699 Mon Sep 17 00:00:00 2001 From: Kelvin Nguyen <77218097+kelvinnguyenn@users.noreply.github.com> Date: Wed, 1 Oct 2025 09:35:19 -0700 Subject: [PATCH 3/4] Add or update the Azure App Service build and deployment workflow config --- ...ion_sp_update_kng_validation-tool(dev).yml | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 .github/workflows/validation_sp_update_kng_validation-tool(dev).yml diff --git a/.github/workflows/validation_sp_update_kng_validation-tool(dev).yml b/.github/workflows/validation_sp_update_kng_validation-tool(dev).yml new file mode 100644 index 0000000..7289b55 --- /dev/null +++ b/.github/workflows/validation_sp_update_kng_validation-tool(dev).yml @@ -0,0 +1,71 @@ +# Docs for the Azure Web Apps Deploy action: https://github.com/Azure/webapps-deploy +# More GitHub Actions for Azure: https://github.com/Azure/actions +# More info on Python, GitHub Actions, and Azure App Service: https://aka.ms/python-webapps-actions + +name: Build and deploy Python app to Azure Web App - validation-tool + +on: + push: + branches: + - validation_sp_update_kng + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read #This is required for actions/checkout + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python version + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Create and start virtual environment + run: | + python -m venv venv + source venv/bin/activate + + - name: Install dependencies + run: pip install -r requirements.txt + + # Optional: Add step to run tests here (PyTest, Django test suites, etc.) + + - name: Upload artifact for deployment jobs + uses: actions/upload-artifact@v4 + with: + name: python-app + path: | + . + !venv/ + + deploy: + runs-on: ubuntu-latest + needs: build + permissions: + id-token: write #This is required for requesting the JWT + contents: read #This is required for actions/checkout + + steps: + - name: Download artifact from build job + uses: actions/download-artifact@v4 + with: + name: python-app + + - name: Login to Azure + uses: azure/login@v2 + with: + client-id: ${{ secrets.AZUREAPPSERVICE_CLIENTID_728F9D7925DE4182BF6B18B9C5F323F5 }} + tenant-id: ${{ secrets.AZUREAPPSERVICE_TENANTID_1DB700BC4B8B4361AC2EBDCC1686194C }} + subscription-id: ${{ secrets.AZUREAPPSERVICE_SUBSCRIPTIONID_DE30FBD7016B4320BD60B3E657D06741 }} + + - name: 'Deploy to Azure Web App' + uses: azure/webapps-deploy@v3 + id: deploy-to-webapp + with: + app-name: 'validation-tool' + slot-name: 'dev' + \ No newline at end of file From ec9f9131bd3a6533e9479d25057da20d5cf749c8 Mon Sep 17 00:00:00 2001 From: Kelvin Nguyen Date: Wed, 1 Oct 2025 15:35:07 -0700 Subject: [PATCH 4/4] Update query to be lazy instead of calling right away --- load_data.py | 55 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/load_data.py b/load_data.py index 3c6aab8..399ad48 100644 --- a/load_data.py +++ b/load_data.py @@ -90,12 +90,30 @@ def read_metadata(scenario_path): df_route = pd.concat(dfs["df_route"],ignore_index=True) df_scenario = pd.concat(dfs["df_scenario"], ignore_index=True) + # elif ENV == 'Azure': + # raw_ids = os.getenv("AZURE_SCENARIO_LIST", "") + # scenario_id_list = [int(s.strip()) for s in raw_ids.split(',') if s.strip().isdigit()] + # scenario_str = ','.join(map(str, scenario_id_list)) + # catalog = os.getenv("DBRICKS_CATALOG", "tam") + + # server_hostname = os.getenv("DATABRICKS_SERVER_HOSTNAME") + + # def credential_provider(): + # config = Config( + # host = f"https://{server_hostname}", + # client_id = os.getenv("DATABRICKS_CLIENT_ID"), + # client_secret = os.getenv("DATABRICKS_CLIENT_SECRET")) + # return oauth_service_principal(config) + + # def query_to_df(cursor, query): + # cursor.execute(query) + # return cursor.fetchall_arrow().to_pandas() + elif ENV == 'Azure': raw_ids = os.getenv("AZURE_SCENARIO_LIST", "") scenario_id_list = [int(s.strip()) for s in raw_ids.split(',') if s.strip().isdigit()] scenario_str = ','.join(map(str, scenario_id_list)) catalog = os.getenv("DBRICKS_CATALOG", "tam") - server_hostname = os.getenv("DATABRICKS_SERVER_HOSTNAME") def credential_provider(): @@ -105,22 +123,25 @@ def credential_provider(): client_secret = os.getenv("DATABRICKS_CLIENT_SECRET")) return oauth_service_principal(config) - def query_to_df(cursor, query): - cursor.execute(query) - return cursor.fetchall_arrow().to_pandas() - - with sql.connect(server_hostname=server_hostname, - http_path=os.getenv("DATABRICKS_HTTP_PATH"), - credentials_provider=credential_provider) as connection: - with connection.cursor() as cursor: - df1 = query_to_df(cursor, f"SELECT * FROM {catalog}.validation.fwy WHERE scenario_id IN ({scenario_str})") - df2 = query_to_df(cursor, f"SELECT * FROM {catalog}.validation.all_class WHERE scenario_id IN ({scenario_str})") - df3 = query_to_df(cursor, f"SELECT * FROM {catalog}.validation.truck WHERE scenario_id IN ({scenario_str})") - df4 = query_to_df(cursor, f"SELECT * FROM {catalog}.validation.board WHERE scenario_id IN ({scenario_str})") - df5 = query_to_df(cursor, f"SELECT * FROM {catalog}.validation.regional_vmt WHERE scenario_id IN ({scenario_str})") - df_link = query_to_df(cursor, f"SELECT scenario_id, ID, Length, geometry FROM {catalog}.abm3.network__emme_hwy_tcad WHERE scenario_id IN ({scenario_str})") - df_route = query_to_df(cursor, f"SELECT scenario_id, route_name, earlyam_hours, evening_hours, transit_route_shape as geometry FROM {catalog}.abm3.network__transit_route WHERE scenario_id IN ({scenario_str})") - df_scenario = query_to_df(cursor, f"SELECT scenario_id, scenario_name, scenario_yr FROM {catalog}.abm3.main__scenario WHERE scenario_id IN ({scenario_str})") + def query_to_df(query): + """Execute query lazily - connects only when called""" + with sql.connect( + server_hostname=server_hostname, + http_path=os.getenv("DATABRICKS_HTTP_PATH"), + credentials_provider=credential_provider + ) as connection: + with connection.cursor() as cursor: + cursor.execute(query) + return cursor.fetchall_arrow().to_pandas() + + df1 = query_to_df(f"SELECT * FROM {catalog}.validation.fwy WHERE scenario_id IN ({scenario_str})") + df2 = query_to_df(f"SELECT * FROM {catalog}.validation.all_class WHERE scenario_id IN ({scenario_str})") + df3 = query_to_df(f"SELECT * FROM {catalog}.validation.truck WHERE scenario_id IN ({scenario_str})") + df4 = query_to_df(f"SELECT * FROM {catalog}.validation.board WHERE scenario_id IN ({scenario_str})") + df5 = query_to_df(f"SELECT * FROM {catalog}.validation.regional_vmt WHERE scenario_id IN ({scenario_str})") + df_link = query_to_df(f"SELECT scenario_id, ID, Length, geometry FROM {catalog}.abm3.network__emme_hwy_tcad WHERE scenario_id IN ({scenario_str})") + df_route = query_to_df(f"SELECT scenario_id, route_name, earlyam_hours, evening_hours, transit_route_shape as geometry FROM {catalog}.abm3.network__transit_route WHERE scenario_id IN ({scenario_str})") + df_scenario = query_to_df(f"SELECT scenario_id, scenario_name, scenario_yr FROM {catalog}.abm3.main__scenario WHERE scenario_id IN ({scenario_str})") # Clean up data df1 = df1.dropna(subset=['count_day', 'day_flow']).drop(columns=['loader__delta_hash_key','loader__updated_date'], errors='ignore').drop_duplicates()