diff --git a/SETUP.md b/SETUP.md index fb41c2b..6ca8a85 100644 --- a/SETUP.md +++ b/SETUP.md @@ -36,8 +36,17 @@ This solution requires a version of Boto3 => 1.3 } ``` -## Deploy Infrastructure with AWS CLI -This template requires use of an S3 bucket given its size. +## Deploying the CloudFormation Template + +A [CloudFormation template](./cloudformation/sagemaker_studio.yml) deploys resources that are required for this workshop, including databases and a SageMaker Studio domain within which a notebook is set up for use with this workshop. To deploy this template, first clone this repo in an environment (e.g., your laptop or another SageMaker notebook) and then follow the instructions below to deploy the stack. + +If deploying with the AWS CLI, you will need to have the CLI V2 installed (see [here](https://docs.aws.amazon.com/cli/latest/userguide/cliv2-migration-instructions.html)). + +Note that the workshop notebooks must be run within the SageMaker Studio domain that is created by this template. Attempting to run these notebooks in a different environment (such as the one used to deploy the template) will fail due to not having sufficient IAM roles to access databases and other resources. + +### Deploy Infrastructure with AWS CLI +This template requires use of an S3 bucket given its size. For example, to deploy in us-west-2: + ``` aws cloudformation deploy \ --stack-name txt2sql \ @@ -48,7 +57,9 @@ aws cloudformation deploy \ --s3-bucket bucket-to-hold-cfn-template ``` -## Deploy Infrastructure using the Console +When updating the parameters in `cloudformation/parameters/{region}.json` make sure that the DBPassword is at least 8 characters long. + +### Deploy Infrastructure using the Console To deploy this template using the AWS Console only, [follow the instructions here](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/cfn-console-create-stack.html) by uploading the template found in the `cloudformation` folder named `sagemaker_studio.yml`. 
Be sure to update the parameters for template when deploying in console [as described here](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/cfn-using-console-create-stack-parameters.html). You will need to update the following: diff --git a/module_3/01_Fine_Tune_Amazon_Titan.ipynb b/module_3/01_Fine_Tune_Amazon_Titan.ipynb index db182fb..d5be64d 100644 --- a/module_3/01_Fine_Tune_Amazon_Titan.ipynb +++ b/module_3/01_Fine_Tune_Amazon_Titan.ipynb @@ -121,7 +121,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "spider_folder = '/home/sagemaker-user/text-to-sql-bedrock-workshop/module_3/spider'" @@ -137,7 +139,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "import pandas as pd\n", @@ -159,9 +163,9 @@ "# print the full string, no truncation\n", "pd.set_option(\"display.max_colwidth\", None)\n", "\n", - "# set s3 bucket:\n", - "S3_BUCKET_NAME = \"\" # Can be found in CloudFormation outputs\n", - "FINE_TUNING_JOB_ROLE_ARN = \"\" # can be found in the cloudformation outputs under BedrockFineTuningJobRole" + "# set s3 bucket, this is found in the CloudFormation outputs (the bucket part of AthenaResultsS3Location):\n", + "S3_BUCKET_NAME = \"...\"\n", + "FINE_TUNING_JOB_ROLE_ARN = \"...\" # can be found in the cloudformation outputs under BedrockFineTuningJobRole" ] }, { @@ -175,7 +179,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def read_json_file(file_name):\n", @@ -227,7 +233,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "query_train_spider = construct_queries(\n", @@ -256,7 +264,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def 
construct_schema(table):\n", @@ -291,7 +301,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def construct_primary_keys(table):\n", @@ -318,7 +330,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def construct_foreign_keys(table):\n", @@ -355,7 +369,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def construct_table_df(tables_path):\n", @@ -391,7 +407,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "\n", @@ -409,7 +427,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "dev_df.head(1)" @@ -440,7 +460,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def template_dataset_titan(\n", @@ -491,7 +513,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "template_dataset_titan(train_df.iloc[0].to_dict(), return_jsonl=False)" @@ -516,7 +540,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# constants\n", @@ -532,7 +558,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def generate_jsonl_file(\n", @@ -613,7 +641,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# create directory for output data sets\n", @@ -667,7 +697,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# # 
Intermediary save to feather format\n", @@ -690,7 +722,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def test_jsonl_file(savepath):\n", @@ -739,7 +773,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Test the train dataset\n", @@ -769,7 +805,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def get_s3_uri(bucket_name: str, local_path: str, s3_path: str) -> str:\n", @@ -797,7 +835,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "session = boto3.session.Session()\n", @@ -810,7 +850,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "bucket_name = S3_BUCKET_NAME\n", @@ -848,7 +890,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "role_arn = FINE_TUNING_JOB_ROLE_ARN\n", @@ -906,7 +950,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# check if tuning job has finished\n", @@ -931,7 +977,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "provisionedModelName = f\"pvs-{custom_model_name}\"\n", @@ -1653,9 +1701,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "conda_pytorch_p310", "language": "python", - "name": "python3" + "name": "conda_pytorch_p310" }, "language_info": { "codemirror_mode": { @@ -1667,7 +1715,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.14" } }, "nbformat": 4, diff 
--git a/module_4/01_prevent_SQL_injection.ipynb b/module_4/01_prevent_SQL_injection.ipynb index 26699ae..373ddbf 100644 --- a/module_4/01_prevent_SQL_injection.ipynb +++ b/module_4/01_prevent_SQL_injection.ipynb @@ -157,8 +157,8 @@ }, "outputs": [], "source": [ - "\n", - "ATHENA_RESULTS_S3_LOCATION = \"\" # available in cloudformation outputs\n", + "# available in cloudformation outputs, bucket name part of AthenaResultsS3Location:\n", + "ATHENA_RESULTS_S3_LOCATION = \"\" \n", "ATHENA_CATALOG_NAME = \"\" # available in cloudformation outputs\n", "DB_NAME = \"tpcds1\"" ] @@ -1433,9 +1433,9 @@ ], "instance_type": "ml.m5.large", "kernelspec": { - "display_name": "Python 3 (Data Science 3.0)", + "display_name": "conda_python3", "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/sagemaker-data-science-310-v1" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { @@ -1447,7 +1447,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.14" } }, "nbformat": 4,