diff --git a/samples/ml/ml_jobs/llm_finetune/README.md b/samples/ml/ml_jobs/llm_finetune/README.md
index d234bd49..031db928 100644
--- a/samples/ml/ml_jobs/llm_finetune/README.md
+++ b/samples/ml/ml_jobs/llm_finetune/README.md
@@ -195,6 +195,13 @@ python scripts/run_eval.py \
     --schema PUBLIC
 ```
 
+The evaluation script:
+- Retrieves the trained model checkpoint from the training job's stage
+- Generates SOAP notes for each test example using the fine-tuned model
+- Uses an LLM-as-judge approach (Qwen3-8B) to compare predictions against ground truth
+- Reports pass/fail accuracy for each SOAP section
+
+
 ### Step 4: Log, deploy & test model
 
 After training and eval, log the model and deploy as a service. This service can be used for rest API as well.
@@ -212,12 +219,6 @@ python scripts/run_log_n_deploy_model.py \
     --schema PUBLIC
 ```
 
 Replace `<job_id>` with the job ID from the training step (e.g., `LLM_DEMO.PUBLIC.ARCTIC_TRAINING_XXXXXXX`).
 
-The evaluation script:
-- Retrieves the trained model checkpoint from the training job's stage
-- Generates SOAP notes for each test example using the fine-tuned model
-- Uses an LLM-as-judge approach (Qwen3-8B) to compare predictions against ground truth
-- Reports pass/fail accuracy for each SOAP section
-
 ## Evaluation Notes
 
 The evaluation script uses an LLM-as-judge approach to assess the quality of generated SOAP notes. For each test example, the fine-tuned model generates predictions which are then compared against ground truth by a larger judge model (Qwen3-8B). This provides more nuanced evaluation than simple text matching, accounting for valid paraphrasing and semantic equivalence.
diff --git a/samples/ml/ml_jobs/llm_finetune/requirements.txt b/samples/ml/ml_jobs/llm_finetune/requirements.txt
index 4956351f..4f0f3200 100644
--- a/samples/ml/ml_jobs/llm_finetune/requirements.txt
+++ b/samples/ml/ml_jobs/llm_finetune/requirements.txt
@@ -1,2 +1,3 @@
 snowflake-ml-python>=1.21.0
-datasets
\ No newline at end of file
+datasets
+peft
\ No newline at end of file
diff --git a/samples/ml/ml_jobs/llm_finetune/scripts/run_log_n_deploy_model.py b/samples/ml/ml_jobs/llm_finetune/scripts/run_log_n_deploy_model.py
index 97056c6a..b59e4c96 100644
--- a/samples/ml/ml_jobs/llm_finetune/scripts/run_log_n_deploy_model.py
+++ b/samples/ml/ml_jobs/llm_finetune/scripts/run_log_n_deploy_model.py
@@ -52,7 +52,7 @@
         schema=args.schema,
         external_access_integrations=args.external_access_integrations,
         session=session,
-        pip_requirements=["transformers", "snowflake-ml-python"],
+        pip_requirements=["transformers", "snowflake-ml-python>=1.26", "peft"],
     )
 
     print(f"Job submitted with ID: {job.id}")
diff --git a/samples/ml/ml_jobs/llm_finetune/src/log_n_deploy_model.py b/samples/ml/ml_jobs/llm_finetune/src/log_n_deploy_model.py
index a076471a..da2db6ac 100644
--- a/samples/ml/ml_jobs/llm_finetune/src/log_n_deploy_model.py
+++ b/samples/ml/ml_jobs/llm_finetune/src/log_n_deploy_model.py
@@ -1,7 +1,8 @@
 import argparse
 import logging
 from pathlib import Path
-import transformers
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from typing import Optional
 
 import pandas as pd
@@ -114,23 +115,45 @@ def main():
 
     # --------- 1) Log model ----------
-    if args.lora_path:
-        raise ValueError("LoRA path is not supported yet")
-
+    version_name = "lora" if args.lora_path else "base"
     model_registry = Registry(session=session)
     try:
-        mv = model_registry.get_model(args.model_name).version("v1")
+        mv = model_registry.get_model(args.model_name).version(version_name)
     except Exception as e:
         # Resolve checkpoint paths to latest global_step if applicable
-        model_path = resolve_path_global_step(resolve_stage_path(session, args.model_name_or_path))
-        logger.info(f"Model {args.model_name} not found, logging to model registry")
-        logger.info(f"Hydrating model from {model_path}")
-        pipe = transformers.pipeline("text-generation", model=model_path)
-        logger.info(f"Hydrated model {args.model_name} from {model_path}, logging to model registry")
+        if args.lora_path:
+            # 1. Load the base model
+            logger.info(f"Loading base model from {args.model_name_or_path}")
+            base_model = AutoModelForCausalLM.from_pretrained(
+                args.model_name_or_path,
+                torch_dtype="auto",
+                device_map="auto"
+            )
+            # 2. Load the LoRA adapter on top of it
+            lora_path = resolve_path_global_step(resolve_stage_path(session, args.lora_path))
+            logger.info(f"Loading LoRA adapter from {lora_path}")
+            model = PeftModel.from_pretrained(base_model, lora_path)
+            # 3. Merge the LoRA weights into the base model and unload the adapter
+            fused_model = model.merge_and_unload()
+            logger.info("Merged LoRA weights into base model")
+
+            # Create a text-generation pipeline with the merged model
+            tokenizer = AutoTokenizer.from_pretrained(lora_path)
+            logger.info("Creating transformers pipeline with merged model")
+            pipe = pipeline("text-generation", model=fused_model, tokenizer=tokenizer)
+            logger.info("Created transformers pipeline with merged model")
+        else:
+            model_path = resolve_path_global_step(resolve_stage_path(session, args.model_name_or_path))
+            logger.info(f"Model {args.model_name} not found, logging to model registry")
+            logger.info(f"Hydrating model from {model_path}")
+            pipe = pipeline("text-generation", model=model_path)
+            logger.info("Created transformers pipeline with model")
+
+        logger.info(f"Logging hydrated model {args.model_name} version {version_name} to model registry")
         mv = model_registry.log_model(
             pipe,
             model_name=args.model_name,
-            version_name="v1",
+            version_name=version_name,
             signatures=openai_signatures.OPENAI_CHAT_SIGNATURE)
         logger.info(f"Model {args.model_name} version {mv.version_name} logged successfully")
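Reviewer note: the snippet below is a minimal, self-contained sketch of the merge-and-pipeline flow this PR adds, useful for trying the step outside the ML job. The base model ID, adapter directory, and prompt are illustrative placeholders, not values from this repo.

```python
# Standalone sketch of the LoRA merge flow added in log_n_deploy_model.py.
# BASE_MODEL, ADAPTER_DIR, and the test prompt are placeholders.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

BASE_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder base checkpoint
ADAPTER_DIR = "/tmp/lora_checkpoint"       # placeholder LoRA adapter directory

# Load the frozen base weights, then attach the trained LoRA adapter.
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype="auto", device_map="auto"
)
model = PeftModel.from_pretrained(base, ADAPTER_DIR)

# merge_and_unload() folds the low-rank deltas into the base weights and
# returns a plain transformers model, so the logged artifact can be served
# without a runtime peft dependency.
merged = model.merge_and_unload()

# Read the tokenizer from the adapter directory, as the script above does.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR)
pipe = pipeline("text-generation", model=merged, tokenizer=tokenizer)

# Smoke test before logging the pipeline to the model registry.
print(pipe("Patient reports a dry cough and mild fever.", max_new_tokens=64)[0]["generated_text"])
```

Merging at log time means the registry receives a plain transformers pipeline, so `peft` is only needed where the merge runs, which is why it is added to `requirements.txt` and to the job's `pip_requirements`.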