diff --git a/machine-learning-notebooks/DGA_Detection_ManagedIdentity.ipynb b/machine-learning-notebooks/DGA_Detection_ManagedIdentity.ipynb new file mode 100644 index 00000000..52b24bf7 --- /dev/null +++ b/machine-learning-notebooks/DGA_Detection_ManagedIdentity.ipynb @@ -0,0 +1,577 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Guided Hunting - Domain Generation Algorithm (DGA) Detection\n", + "
\n", + " Details...\n", + "**Python Version:** Python 3.8 (including Python 3.8 - AzureML)
\n", + "**Required Packages**: msticpy, pandas, numpy, matplotlib, plotly, ipywidgets, ipython, scikit-learn, joblib
\n", + "\n", + "**Data Sources Required**:\n", + "- Log Analytics - DeviceNetworkEvents\n", + "\n", + "
\n", + "\n", + "Brings together a series of queries and visualizations to help you investigate suspected Domain Generation Algorithm (DGA) domains queried in your network. There are then guided hunting steps to investigate these occurrences in further depth. This notebook authenticates with a managed identity and requires the following:\n", + "- msticpyconfig.yaml has been properly configured\n", + "- managed identity with appropriate RBAC" + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Log in with Managed Identity\n", + "Replace the [CLIENT_ID] with the client id of your Managed Identity. This can be found on the Azure Portal at Managed Identities -> Overview" + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "!az login --identity --username [CLIENT_ID]" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Import Libraries" + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "import msticpy\n", + "import msticpy as mp\n", + "from azure.identity import DefaultAzureCredential, ManagedIdentityCredential\n", + "from azure.keyvault.secrets import SecretClient\n", + "from azure.mgmt.resource import ResourceManagementClient\n", + "\n", + "\n", + "# Initialize ManagedIdentity\n", + "credential = ManagedIdentityCredential()\n", + "\n", + "\n", + "# Now you can use ManagedIdentity or other credential classes\n", + "print(credential)\n" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1743622407389 + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Setup msticpyconfig.yaml\n", + "Ensure your msticpyconfig.yaml has been set up and saved in the directory you are running this notebook from."
# %%
# Locate msticpyconfig.yaml, report the configured Sentinel workspaces, and
# reload msticpy's settings so later cells see the file's contents.
import json
import os
from pathlib import Path

import msticpy
from msticpy.config import MpConfigEdit, MpConfigFile

# Default location: a file named msticpyconfig.yaml in the working directory.
mp_conf = "msticpyconfig.yaml"

# Prefer the path held in the MSTICPYCONFIG environment variable, but only
# when it points at a file that actually exists.
mp_env = os.environ.get("MSTICPYCONFIG")
mp_conf = mp_env if mp_env and Path(mp_env).is_file() else mp_conf

if not Path(mp_conf).is_file():
    # The guidance names the two locations this cell actually checks.
    print(
        "No msticpyconfig.yaml was found!",
        "Please check that msticpyconfig.yaml is saved in the current directory,",
        "or set the MSTICPYCONFIG environment variable to its full path,",
        "and then re-run this cell.",
        sep="\n"
    )
else:
    mpedit = MpConfigEdit(mp_conf)
    mpconfig = MpConfigFile(mp_conf)

    # Print only the Sentinel workspace entries. Dumping the whole settings
    # file would copy every other section (which may contain keys or other
    # secrets) into the saved notebook output.
    sentinel_settings = mpconfig.settings.get("AzureSentinel") or {}
    settings_dict = dict(sentinel_settings.get("Workspaces") or {})
    print(f"Configured Sentinel workspaces: {json.dumps(settings_dict, indent=4, default=str)}")

msticpy.settings.refresh_config()

# %% [markdown]
# ## Setup QueryProvider

# %%
# Refresh any config items that might have been saved
# to the msticpyconfig in the previous steps.
msticpy.settings.refresh_config()

# Initialize a QueryProvider for Microsoft Sentinel.
# `mp` is the `import msticpy as mp` alias from the "Import Libraries" cell.
qry_prov = mp.QueryProvider("AzureSentinel")

# %% [markdown]
# ## Connect to Sentinel
# You should see "connected" output after running this code block. Once you are connected, you can continue on with the notebook.

# %%
# Get the default Microsoft Sentinel workspace details from msticpyconfig.yaml
ws_config = mp.WorkspaceConfig()

# Connect to Microsoft Sentinel with our QueryProvider and config details,
# restricting authentication to the managed-identity ("msi") method.
qry_prov.connect(ws_config, mp_az_auth=["msi"])

# %% [markdown]
# ## DGA Model Creation
# Make sure "domain.csv" is saved in your current working directory. Change the "model_filename" to the appropriate path in your environment.
# %%
# Train a DGA / non-DGA domain classifier from the labeled CSV and save it.
import os
from pathlib import Path

import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Paths are resolved against the current working directory, which is where
# the notebook text asks for "domain.csv" to be saved. Edit these two values
# if your files live elsewhere.
domain_csv = Path.cwd() / "domain.csv"
model_filename = Path.cwd() / "Models" / "dga_model.joblib"

# Load the CSV file containing the labeled domains. Rows missing either
# column cannot be vectorized or labeled, so they are dropped up front.
labeled_domains_df = pd.read_csv(domain_csv).dropna(subset=["Domain", "Label"])

# Preprocess the data: the domain string is the feature, and the label is
# 1 for 'DGA' and 0 for anything else.
X = labeled_domains_df["Domain"].astype(str)
y = labeled_domains_df["Label"].eq("DGA").astype(int)

# Split the data into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Character n-grams rather than the default word analyzer: the word analyzer
# turns each dot-separated label into a single token, so a generated label
# never seen in training contributes nothing at prediction time. Character
# n-grams let the classifier score the letter patterns inside unseen domains.
model = make_pipeline(
    CountVectorizer(analyzer="char", ngram_range=(2, 4)),
    MultinomialNB(),
)

# Train the model
model.fit(X_train, y_train)

# Save the trained model to a file, creating the target directory first so
# the dump does not fail when "Models" does not exist yet.
model_filename.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_filename)
print(f'Model saved to {model_filename}')

# Evaluate the model (optional)
accuracy = model.score(X_test, y_test)
print(f'Model accuracy: {accuracy:.2f}')

# %% [markdown]
# ## Apply dga_model.joblib to Sentinel Data

# %%
# Pull DNS query names seen in the last 30 days. `TimeGenerated > ago(30d)`
# keeps rows newer than 30 days ago; `<` would keep only rows older than that.
query = """
DeviceNetworkEvents
| where TimeGenerated > ago(30d)
| where ActionType == "DnsConnectionInspected"
| extend QueryField = tostring(parse_json(AdditionalFields).query)
| where isnotempty(QueryField)
| where QueryField matches regex @"[a-zA-Z0-9]{8,}"
| summarize Count = count() by QueryField
| where Count > 10
"""

# Set the maximum column width to None (no truncation)
pd.set_option('display.max_colwidth', None)
df = qry_prov.exec_query(query)

# Load the trained model from the file.
# NOTE: joblib files are pickle-based; only load model files you created or trust.
model = joblib.load(model_filename)
print(f'Model loaded from {model_filename}')


def is_dga(domain):
    """Return True when the loaded model labels a single domain as DGA (class 1)."""
    return model.predict([domain])[0] == 1


# Score every queried name in one predict call instead of one call per row.
# An empty result set is handled separately because the estimator rejects
# zero-sample input.
if df.empty:
    df['IsDGA'] = pd.Series(dtype=bool)
else:
    df['IsDGA'] = model.predict(df['QueryField']) == 1

# Display the updated dataframe
df.head(20)

# %% [markdown]
# ## Output All Results to CSV
# Change the "output_path" variable to match your environment.
# %%
# Export the scored results to CSV files under the current working directory.
import os
from pathlib import Path

import pandas as pd


def export_results(frame, destination):
    """Write a DataFrame to a CSV file without the index column.

    Parameters
    ----------
    frame : pandas.DataFrame
        The data to export.
    destination : str or pathlib.Path
        Target CSV file; missing parent directories are created.

    Returns
    -------
    pathlib.Path
        The path that was written.
    """
    destination = Path(destination)
    # Ensure the directory exists
    destination.parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(destination, index=False)
    return destination


# All results (every queried name plus its IsDGA flag). Change "output_path"
# to match your environment.
output_path = Path.cwd() / "dgaresults.csv"
export_results(df, output_path)

print(f"DataFrame has been exported to {output_path}")

# %% [markdown]
# ## Filter DGA Results to CSV
# Any results that match the DGA detection algorithm will be saved to a csv. Change the "output_path" to your environment

# %%
# Keep only the rows whose IsDGA column is True.
filtered_df = df.loc[df['IsDGA'].eq(True)]

# Change "output_path" to match your environment.
output_path = Path.cwd() / "dgaresults2.csv"
export_results(filtered_df, output_path)

print(f"Filtered DataFrame has been exported to {output_path}")
"language_info": { + "name": "python", + "version": "3.10.11", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "microsoft": { + "ms_spell_check": { + "ms_spell_check_language": "en" + }, + "host": { + "AzureML": { + "notebookHasBeenCompleted": true + } + } + }, + "kernel_info": { + "name": "python38-azureml" + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "version_major": 2, + "version_minor": 0, + "state": { + "bd9a68719d5d4769a0172dafce29c3ed": { + "model_name": "LabelModel", + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "state": { + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_name": "LabelModel", + "_model_module": "@jupyter-widgets/controls", + "tooltip": null, + "description_allow_html": false, + "_view_name": "LabelView", + "tabbable": null, + "_view_module": "@jupyter-widgets/controls", + "_dom_classes": [], + "layout": "IPY_MODEL_bbe69074cc034c4cbe0159f7aa02e651", + "value": "Loading. 
Please wait....", + "style": "IPY_MODEL_ea046babc5d14729acc2994b9ef15916", + "placeholder": "​", + "_view_count": null, + "_model_module_version": "1.5.0", + "disabled": false, + "description": "" + } + }, + "bbe69074cc034c4cbe0159f7aa02e651": { + "model_name": "LayoutModel", + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "state": { + "_view_module_version": "1.2.0", + "_model_name": "LayoutModel", + "grid_row": null, + "_model_module": "@jupyter-widgets/base", + "overflow": null, + "max_height": null, + "display": null, + "border_top": null, + "grid_auto_flow": null, + "grid_template_rows": null, + "align_self": null, + "grid_auto_columns": null, + "width": null, + "grid_area": null, + "align_items": null, + "_view_name": "LayoutView", + "left": null, + "height": null, + "_view_module": "@jupyter-widgets/base", + "border_right": null, + "object_position": null, + "justify_content": null, + "bottom": null, + "max_width": null, + "border": null, + "margin": null, + "order": null, + "grid_column": null, + "grid_auto_rows": null, + "padding": null, + "grid_template_columns": null, + "justify_items": null, + "object_fit": null, + "visibility": "hidden", + "_view_count": null, + "flex_flow": null, + "min_height": null, + "top": null, + "min_width": null, + "flex": null, + "border_left": null, + "_model_module_version": "1.2.0", + "grid_template_areas": null, + "overflow_x": null, + "right": null, + "overflow_y": null, + "grid_gap": null, + "border_bottom": null, + "align_content": null + } + }, + "ea046babc5d14729acc2994b9ef15916": { + "model_name": "DescriptionStyleModel", + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_name": "DescriptionStyleModel", + "_model_module_version": "1.5.0", + "_view_module": "@jupyter-widgets/base", + "_view_name": "StyleView", + "_view_module_version": "1.2.0", + "_view_count": null, + 
"description_width": "" + } + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file