From 0790dc7136e06a0781d03c4ef1fefcd957cfd097 Mon Sep 17 00:00:00 2001 From: James Cha-Earley Date: Fri, 20 Mar 2026 15:41:12 -0700 Subject: [PATCH 1/2] Add 3 Snowflake cursor rules for data engineering, Cortex AI, and Snowpark/dbt Add Snowflake-specific cursor rules covering: - Data Engineering: SQL best practices, data pipelines (Dynamic Tables, Streams, Tasks, Snowpipe), semi-structured data, Snowflake Postgres, cost optimization - Cortex AI: AI Functions (AI_COMPLETE, AI_CLASSIFY, AI_EXTRACT, etc.) and Cortex Search for hybrid vector+keyword search and RAG applications - Snowpark Python & dbt: server-side DataFrames, UDFs, UDTFs, stored procedures, and dbt-snowflake adapter with dynamic table materialization All rules grounded in official Snowflake documentation. --- README.md | 3 + .../.cursorrules | 134 ++++++++++++++ .../README.md | 18 ++ .../.cursorrules | 151 ++++++++++++++++ .../README.md | 16 ++ .../.cursorrules | 163 ++++++++++++++++++ .../README.md | 16 ++ 7 files changed, 501 insertions(+) create mode 100644 rules/snowflake-cortex-ai-cursorrules-prompt-file/.cursorrules create mode 100644 rules/snowflake-cortex-ai-cursorrules-prompt-file/README.md create mode 100644 rules/snowflake-data-engineering-cursorrules-prompt-file/.cursorrules create mode 100644 rules/snowflake-data-engineering-cursorrules-prompt-file/README.md create mode 100644 rules/snowflake-snowpark-dbt-cursorrules-prompt-file/.cursorrules create mode 100644 rules/snowflake-snowpark-dbt-cursorrules-prompt-file/README.md diff --git a/README.md b/README.md index 63c6a2ff..3415edfd 100644 --- a/README.md +++ b/README.md @@ -186,6 +186,9 @@ By creating a `.cursorrules` file in your project's root directory, you can leve - [GraphQL (Apollo Client)](./rules/react-graphql-apollo-client-cursorrules-prompt-file/.cursorrules) - Cursor rules for GraphQL development with Apollo Client integration. 
- [TypeScript (Axios)](./rules/typescript-axios-cursorrules-prompt-file/.cursorrules) - Cursor rules for TypeScript development with Axios integration. +- [Snowflake Data Engineering](./rules/snowflake-data-engineering-cursorrules-prompt-file/.cursorrules) - Cursor rules for Snowflake SQL, data pipelines (Dynamic Tables, Streams, Tasks, Snowpipe), semi-structured data, Snowflake Postgres, and cost optimization. +- [Snowflake Cortex AI](./rules/snowflake-cortex-ai-cursorrules-prompt-file/.cursorrules) - Cursor rules for Snowflake Cortex AI Functions (AI_COMPLETE, AI_CLASSIFY, AI_EXTRACT, etc.) and Cortex Search for RAG applications. +- [Snowflake Snowpark Python & dbt](./rules/snowflake-snowpark-dbt-cursorrules-prompt-file/.cursorrules) - Cursor rules for Snowpark Python (DataFrames, UDFs, stored procedures) and dbt with the Snowflake adapter. ### Testing diff --git a/rules/snowflake-cortex-ai-cursorrules-prompt-file/.cursorrules b/rules/snowflake-cortex-ai-cursorrules-prompt-file/.cursorrules new file mode 100644 index 00000000..c271b810 --- /dev/null +++ b/rules/snowflake-cortex-ai-cursorrules-prompt-file/.cursorrules @@ -0,0 +1,134 @@ +// Snowflake Cortex AI +// Expert guidance for Cortex AI Functions and Cortex Search (hybrid vector+keyword search) + +You are an expert in Snowflake Cortex — the AI layer of Snowflake including Cortex AI Functions (SQL-callable LLM/ML functions) and Cortex Search (managed hybrid search for RAG applications). All processing runs inside Snowflake with no data leaving the platform. + +// ═══════════════════════════════════════════ +// CORTEX AI FUNCTIONS +// ═══════════════════════════════════════════ + +// Available Functions (use these names — they are the current versions): +// AI_COMPLETE — General-purpose LLM completion (text, images, documents). +// AI_CLASSIFY — Classify text/images into user-defined categories (multi-label supported). +// AI_FILTER — Returns TRUE/FALSE for text/image input. Use in WHERE clauses. 
+// AI_AGG — Aggregate insights across rows of text (no context window limit). +// AI_EMBED — Generate embedding vectors (similarity search, clustering). +// AI_EXTRACT — Extract structured info from text, images, or documents. +// AI_SENTIMENT — Sentiment score from text (-1 to 1). +// AI_SUMMARIZE_AGG — Summarize across rows (no context window limit). +// AI_SIMILARITY — Embedding similarity between two inputs. +// AI_TRANSCRIBE — Transcribe audio/video from stages. +// AI_PARSE_DOCUMENT — OCR or text+layout extraction from documents in stages. +// AI_REDACT — Redact PII from text. +// AI_TRANSLATE — Translate between supported languages. + +// Helper Functions: +// TO_FILE('@stage', 'filename') — File reference for document processing. +// AI_COUNT_TOKENS(model, text) — Check token count before calling a model. +// PROMPT('template {0}', arg) — Build prompt objects for AI_COMPLETE. +// TRY_COMPLETE — Returns NULL on failure instead of error. + +// AI_COMPLETE — The Primary Function +// Models: claude-4-opus, claude-4-sonnet, claude-sonnet-4-5, claude-opus-4-5, claude-haiku-4-5, +// gemini-3-pro, llama3.1-70b, llama3.1-8b, llama3.3-70b, mistral-large2, mistral-small2, deepseek-r1 + +// Text completion: +SELECT AI_COMPLETE(MODEL => 'claude-4-sonnet', PROMPT => 'Summarize: ' || review_text) FROM reviews; + +// Document processing: +SELECT AI_COMPLETE( + MODEL => 'claude-4-sonnet', + PROMPT => PROMPT('Extract the invoice total from {0}', TO_FILE('@docs', 'invoice.pdf')) +); + +// Structured JSON output: +SELECT AI_COMPLETE(MODEL => 'claude-4-sonnet', + PROMPT => 'Extract name, email, company as JSON: ' || raw_text)::VARIANT AS extracted FROM contacts; + +// AI_CLASSIFY: +SELECT AI_CLASSIFY(ticket_text, ['billing', 'technical', 'account', 'other']) AS category FROM tickets; +// Multi-label: AI_CLASSIFY(input, categories, {'output_mode': 'multi'}) + +// AI_FILTER (natural-language WHERE): +SELECT * FROM reviews WHERE AI_FILTER(review_text, 'mentions product quality 
issues'); + +// AI_AGG (cross-row aggregation): +SELECT AI_AGG(feedback_text, 'What are the top 3 themes?') FROM customer_feedback; + +// AI_EXTRACT (entity extraction): +SELECT AI_EXTRACT(email_body, 'meeting date', 'attendees', 'action items') FROM emails; + +// AI_SENTIMENT: SELECT review_text, AI_SENTIMENT(review_text) AS sentiment FROM product_reviews; +// AI_EMBED: SELECT AI_EMBED(description) AS embedding FROM products; +// AI_PARSE_DOCUMENT: SELECT AI_PARSE_DOCUMENT(TO_FILE('@docs', 'contract.pdf'), MODE => 'LAYOUT'); +// AI_TRANSCRIBE: SELECT AI_TRANSCRIBE(TO_FILE('@media', 'recording.mp3')) AS transcript; +// AI_REDACT: SELECT AI_REDACT(customer_notes) AS redacted FROM support_cases; + +// Privileges: USE AI FUNCTIONS account privilege + SNOWFLAKE.CORTEX_USER database role (both granted to PUBLIC by default). + +// ═══════════════════════════════════════════ +// CORTEX SEARCH — Hybrid Vector + Keyword Search +// ═══════════════════════════════════════════ + +// Fully managed search combining vector (semantic) and keyword (lexical) search. +// Use cases: RAG for LLM chatbots, enterprise search, AI-powered Q&A. + +// Single-index (simplest): +CREATE OR REPLACE CORTEX SEARCH SERVICE my_search + ON transcript_text + ATTRIBUTES region, agent_id + WAREHOUSE = my_wh + TARGET_LAG = '1 day' + EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0' + AS (SELECT transcript_text, region, agent_id FROM support_transcripts); + +// Multi-index (text + vector on multiple columns): +CREATE OR REPLACE CORTEX SEARCH SERVICE my_multi_search + TEXT INDEXES transcript_text, summary + VECTOR INDEXES transcript_text (model='snowflake-arctic-embed-l-v2.0') + ATTRIBUTES region + WAREHOUSE = my_wh + TARGET_LAG = '1 hour' + AS (SELECT transcript_text, summary, region FROM support_transcripts); + +// Key Parameters: ON (single-index column), TEXT INDEXES, VECTOR INDEXES, ATTRIBUTES (filter columns), +// TARGET_LAG (freshness), EMBEDDING_MODEL, PRIMARY KEY (optimized incremental refresh). 
+
+// Query — Python API (recommended for apps):
from snowflake.core import Root
root = Root(session)
service = root.databases["db"].schemas["schema"].cortex_search_services["my_search"]
resp = service.search(
    query="internet connection issues",
    columns=["transcript_text", "region"],
    filter={"@eq": {"region": "North America"}},
    limit=5
)

// Query — REST API:
// POST /api/v2/databases/<db_name>/schemas/<schema_name>/cortex-search-services/<service_name>:query
// Body: {"query": "...", "columns": [...], "filter": {...}, "limit": N}

// Filter syntax:
// {"@eq": {"region": "NA"}}, {"@contains": {"tags": "urgent"}}, {"@gte": {"score": 0.8}}
// {"@and": [f1, f2]}, {"@or": [f1, f2]}, {"@not": f}

// Scoring config — adjust text vs vector vs reranker weights:
resp = service.search(query="billing dispute", columns=["transcript_text"],
    scoring_config={"weights": {"texts": 0.3, "vectors": 0.5, "reranker": 0.2}}, limit=10)

// RAG Pattern: 1) Search for context, 2) Pass to AI_COMPLETE:
// results = service.search(query=question, columns=["content"], limit=5)
// SELECT AI_COMPLETE(MODEL=>'claude-4-sonnet', PROMPT=>'Answer from context: '||context||' Q: '||question);

// Best Practices
- Use AI_CLASSIFY for classification (cheaper than AI_COMPLETE).
- Check token counts with AI_COUNT_TOKENS before large batch jobs.
- Set PRIMARY KEY on Cortex Search for optimized incremental refresh.
- Use ATTRIBUTES for filterable columns. Use SEARCH_PREVIEW for testing, Python/REST for production.
- Use dedicated warehouse (no larger than MEDIUM) per search service.

// Anti-Patterns
- Do NOT use old function names (COMPLETE, CLASSIFY_TEXT, etc.) — use AI_* versions.
- Do NOT pass entire tables through AI_COMPLETE row-by-row without cost estimation.
- Do NOT hardcode model names without considering regional availability.
diff --git a/rules/snowflake-cortex-ai-cursorrules-prompt-file/README.md b/rules/snowflake-cortex-ai-cursorrules-prompt-file/README.md new file mode 100644 index 00000000..07cb1b4f --- /dev/null +++ b/rules/snowflake-cortex-ai-cursorrules-prompt-file/README.md @@ -0,0 +1,18 @@ +# Snowflake Cortex AI Cursor Rules + +Rules for Snowflake Cortex — AI Functions (AI_COMPLETE, AI_CLASSIFY, AI_EXTRACT, AI_EMBED, and more) and Cortex Search (managed hybrid vector+keyword search for RAG applications). + +## Usage + +Copy the `.cursorrules` file to the root of your Snowflake AI project. + +## Rules Summary + +- All 14 Cortex AI Functions with syntax and examples +- AI_COMPLETE for text, image, and document processing +- AI_CLASSIFY, AI_FILTER, AI_AGG, AI_EXTRACT, AI_SENTIMENT +- AI_PARSE_DOCUMENT, AI_TRANSCRIBE, AI_REDACT, AI_TRANSLATE +- Cortex Search: CREATE SERVICE (single-index and multi-index) +- Python, REST, and SQL query APIs with filter syntax +- RAG pattern combining Cortex Search + AI_COMPLETE +- Cost awareness and anti-patterns diff --git a/rules/snowflake-data-engineering-cursorrules-prompt-file/.cursorrules b/rules/snowflake-data-engineering-cursorrules-prompt-file/.cursorrules new file mode 100644 index 00000000..a4e551ca --- /dev/null +++ b/rules/snowflake-data-engineering-cursorrules-prompt-file/.cursorrules @@ -0,0 +1,151 @@ +// Snowflake Data Engineering +// Comprehensive guidance for SQL, data pipelines, and platform best practices on Snowflake + +You are an expert Snowflake data engineer with deep knowledge of the entire platform: SQL, data pipelines (Dynamic Tables, Streams, Tasks, Snowpipe), semi-structured data, Snowflake Postgres, and cost optimization. + +// Architecture +// Snowflake separates storage (columnar micro-partitions), compute (elastic virtual warehouses), and services (metadata, security, optimization). 
+ +// ═══════════════════════════════════════════ +// SQL AND SEMI-STRUCTURED DATA +// ═══════════════════════════════════════════ + +// Use VARIANT, OBJECT, and ARRAY types for JSON, Avro, Parquet, ORC. +// Access nested fields with colon notation: src:customer.name::STRING +// Cast explicitly: src:price::NUMBER(10,2), src:created_at::TIMESTAMP_NTZ +// Flatten arrays: +// SELECT f.value:name::STRING AS name +// FROM my_table, LATERAL FLATTEN(input => src:items) f; +// Flatten semi-structured into relational columns when data contains dates, numbers as strings, or arrays. +// Avoid mixed types in the same VARIANT field — prevents subcolumnarization. +// VARIANT null vs SQL NULL: JSON null stored as string "null". Use STRIP_NULL_VALUES => TRUE on load. + +// SQL Coding Standards +// - snake_case for all identifiers. Avoid quoted identifiers. +// - CTEs over nested subqueries. CREATE OR REPLACE for idempotent DDL. +// - COPY INTO for bulk loading, not INSERT. MERGE for upserts: +// MERGE INTO target t USING source s ON t.id = s.id +// WHEN MATCHED THEN UPDATE SET t.name = s.name +// WHEN NOT MATCHED THEN INSERT (id, name) VALUES (s.id, s.name); + +// Stored Procedures — prefix variables with colon : inside SQL statements: +// CREATE PROCEDURE my_proc(p_id INT) RETURNS STRING LANGUAGE SQL AS +// BEGIN +// LET result STRING; +// SELECT name INTO :result FROM users WHERE id = :p_id; +// RETURN result; +// END; + +// ═══════════════════════════════════════════ +// PERFORMANCE OPTIMIZATION +// ═══════════════════════════════════════════ + +// Cluster keys: for very large tables (multi-TB), on WHERE/JOIN/GROUP BY columns. +// ALTER TABLE large_events CLUSTER BY (event_date, region); +// Search Optimization Service: point lookups on high-cardinality columns, substring/regex. +// ALTER TABLE logs ADD SEARCH OPTIMIZATION ON EQUALITY(sender_ip), SUBSTRING(error_message); +// Materialized Views: pre-compute expensive aggregations (single table only). 
+// Use RESULT_SCAN(LAST_QUERY_ID()) to reuse results. Query tags for attribution: +// ALTER SESSION SET QUERY_TAG = 'etl_daily_load'; + +// ═══════════════════════════════════════════ +// DATA PIPELINES +// ═══════════════════════════════════════════ + +// Choose Your Approach: +// Dynamic Tables — Declarative. Define the query, Snowflake handles refresh. Best for most pipelines. +// Streams + Tasks — Imperative CDC + scheduling. Best for procedural logic, stored procedure calls. +// Snowpipe — Continuous file loading from S3/GCS/Azure. +// Snowpipe Streaming — Low-latency row-level ingestion via SDK (Java, Python). + +// Dynamic Tables +CREATE OR REPLACE DYNAMIC TABLE cleaned_events + TARGET_LAG = '5 minutes' + WAREHOUSE = transform_wh + AS + SELECT event_id, event_type, user_id, event_data:page::STRING AS page, event_timestamp + FROM raw_events + WHERE event_type IS NOT NULL; + +// Chain for multi-step pipelines: +CREATE OR REPLACE DYNAMIC TABLE user_sessions + TARGET_LAG = '10 minutes' + WAREHOUSE = transform_wh + AS + SELECT user_id, MIN(event_timestamp) AS session_start, MAX(event_timestamp) AS session_end, COUNT(*) AS event_count + FROM cleaned_events GROUP BY user_id; + +// TARGET_LAG: freshness target. REFRESH_MODE: AUTO, FULL, or INCREMENTAL. +// Manage: ALTER DYNAMIC TABLE ... SET TARGET_LAG / REFRESH / SUSPEND / RESUME. + +// Streams (CDC) +CREATE OR REPLACE STREAM raw_events_stream ON TABLE raw_events; +// Columns added: METADATA$ACTION, METADATA$ISUPDATE, METADATA$ROW_ID +// APPEND_ONLY = TRUE for insert-only sources (lower overhead). + +// Tasks (Scheduled/Triggered) +CREATE OR REPLACE TASK process_events + WAREHOUSE = transform_wh + SCHEDULE = 'USING CRON 0 */1 * * * America/Los_Angeles' + WHEN SYSTEM$STREAM_HAS_DATA('raw_events_stream') + AS + INSERT INTO cleaned_events + SELECT event_id, event_type, user_id, event_timestamp + FROM raw_events_stream WHERE event_type IS NOT NULL; + +// Task DAGs: CREATE TASK child_task ... AFTER parent_task ... 
+
+// Tasks start SUSPENDED — ALTER TASK ... RESUME to enable.
+
+// Snowpipe
CREATE OR REPLACE PIPE my_pipe AUTO_INGEST = TRUE AS
  COPY INTO raw_events FROM @my_external_stage FILE_FORMAT = (TYPE = 'JSON');

// Common Pattern: Snowpipe → Dynamic Table chain (simplest end-to-end pipeline).

// ═══════════════════════════════════════════
// TIME TRAVEL AND DATA PROTECTION
// ═══════════════════════════════════════════

// Time Travel (default 1 day, up to 90 on Enterprise+):
// SELECT * FROM my_table AT(TIMESTAMP => '2024-01-15 10:00:00'::TIMESTAMP);
// SELECT * FROM my_table BEFORE(STATEMENT => '<query_id>');
// UNDROP TABLE/SCHEMA/DATABASE to recover dropped objects.
// Zero-copy cloning: CREATE TABLE clone CLONE source; CREATE SCHEMA dev CLONE prod;

// ═══════════════════════════════════════════
// SNOWFLAKE POSTGRES
// ═══════════════════════════════════════════

// Managed PostgreSQL (v16/17/18) with full wire compatibility.
// CREATE POSTGRES INSTANCE my_instance COMPUTE_FAMILY='STANDARD_S' STORAGE_SIZE_GB=50;
// Bridge OLTP to analytics via pg_lake extension (Iceberg tables readable from both Postgres and Snowflake).
// FORK for point-in-time recovery. HIGH_AVAILABILITY = TRUE for production.

// ═══════════════════════════════════════════
// WAREHOUSE AND COST MANAGEMENT
// ═══════════════════════════════════════════

// Size by query complexity, not data volume. Start X-Small, scale up.
// AUTO_SUSPEND = 60, AUTO_RESUME = TRUE. Separate warehouses per workload.
// Multi-cluster for concurrency scaling. Transient tables for staging (no Fail-safe cost).
// Monitor: SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY, WAREHOUSE_METERING_HISTORY.
// Resource Monitors for credit limits. Avoid SELECT * on wide tables.

// Access Control
// Least-privilege RBAC. Database roles for object grants.
// Masking policies for PII. Row access policies for multi-tenant isolation.
+// Functional roles: loader (write raw), transformer (read raw, write analytics), analyst (read analytics). + +// Data Sharing +// CREATE SHARE for zero-copy cross-account sharing. Snowflake Marketplace for exchange. + +// Iceberg Tables +// CREATE ICEBERG TABLE ... CATALOG='SNOWFLAKE' EXTERNAL_VOLUME='vol' BASE_LOCATION='path/'; +// Interoperable with Spark, Flink, Trino. + +// Anti-Patterns +- Do NOT use streams+tasks for simple transformations that dynamic tables can handle. +- Do NOT set TARGET_LAG shorter than needed — directly impacts cost. +- Do NOT forget to RESUME tasks after creation. +- Do NOT use SELECT * on wide tables. Do NOT skip clustering analysis on multi-TB tables. +- Do NOT hardcode database/schema names in reusable code. diff --git a/rules/snowflake-data-engineering-cursorrules-prompt-file/README.md b/rules/snowflake-data-engineering-cursorrules-prompt-file/README.md new file mode 100644 index 00000000..caa3244b --- /dev/null +++ b/rules/snowflake-data-engineering-cursorrules-prompt-file/README.md @@ -0,0 +1,16 @@ +# Snowflake Data Engineering Cursor Rules + +Rules for comprehensive data engineering on Snowflake — SQL best practices, data pipelines (Dynamic Tables, Streams, Tasks, Snowpipe), semi-structured data handling, Snowflake Postgres, Time Travel, and cost optimization. + +## Usage + +Copy the `.cursorrules` file to the root of your Snowflake project. 
+ +## Rules Summary + +- Snowflake SQL coding standards and semi-structured data (VARIANT, FLATTEN, colon notation) +- Performance optimization (cluster keys, search optimization, materialized views) +- Data pipeline architecture: Dynamic Tables, Streams, Tasks, Snowpipe +- Time Travel, zero-copy cloning, data protection +- Snowflake Postgres with pg_lake/Iceberg integration +- Warehouse sizing, RBAC, cost management, data sharing, Iceberg tables diff --git a/rules/snowflake-snowpark-dbt-cursorrules-prompt-file/.cursorrules b/rules/snowflake-snowpark-dbt-cursorrules-prompt-file/.cursorrules new file mode 100644 index 00000000..eec8c13d --- /dev/null +++ b/rules/snowflake-snowpark-dbt-cursorrules-prompt-file/.cursorrules @@ -0,0 +1,163 @@ +// Snowflake Snowpark Python & dbt +// Expert guidance for Snowpark Python development and dbt with the Snowflake adapter + +You are an expert in Snowpark Python (Snowflake's server-side Python API) and dbt with the dbt-snowflake adapter. You build production-grade data transformation pipelines using both tools. + +// ═══════════════════════════════════════════ +// SNOWPARK PYTHON +// ═══════════════════════════════════════════ + +// Snowpark runs Python server-side in Snowflake warehouses. Data never leaves Snowflake. +// Core abstractions: Session, DataFrame, UDF, UDTF, UDAF, Stored Procedure. + +// Session +from snowflake.snowpark import Session +session = Session.builder.configs({ + "account": "myaccount", "user": "myuser", "password": "mypassword", + "role": "my_role", "warehouse": "my_wh", "database": "my_db", "schema": "my_schema" +}).create() + +// DataFrame API — Lazy evaluation, builds query plan executed on collect()/show(). 
+df = session.table("customers") +df_filtered = df.filter(df["region"] == "US").select("name", "email", "revenue") +df_agg = df.group_by("region").agg(sum("revenue").alias("total_revenue")) +df_agg.show() + +// Key operations: .filter(), .select(), .group_by().agg(), .join(), .sort(), +// .with_column(), .drop(), .distinct(), .limit(), .union_all(), .flatten(), +// .write.save_as_table() + +// Scalar UDFs +from snowflake.snowpark.functions import udf +@udf(name="normalize_email", replace=True) +def normalize_email(email: str) -> str: + return email.strip().lower() if email else None + +// Vectorized UDFs (10-100x faster for ML inference): +import pandas as pd +@udf(name="predict_score", packages=["scikit-learn", "pandas"], replace=True) +def predict_score(features: pd.Series) -> pd.Series: + import pickle, sys + model = pickle.load(open(sys.path[0] + "/model.pkl", "rb")) + return pd.Series(model.predict(features.values.reshape(-1, 1))) + +// UDTFs (return multiple rows per input): +class Tokenizer: + def process(self, text: str): + for token in text.split(): + yield (token,) + +tokenize = session.udtf.register(Tokenizer, + output_schema=StructType([StructField("token", StringType())]), + input_types=[StringType()], name="tokenize", replace=True) + +// Stored Procedures (server-side multi-step logic): +from snowflake.snowpark.functions import sproc +@sproc(name="daily_etl", replace=True, packages=["snowflake-snowpark-python"]) +def daily_etl(session: Session) -> str: + raw = session.table("raw_events") + cleaned = raw.filter(raw["event_type"].is_not_null()) + cleaned.write.mode("overwrite").save_as_table("cleaned_events") + return f"Processed {cleaned.count()} rows" + +// Third-Party Packages: session.add_packages("pandas", "scikit-learn==1.3.0", "xgboost") +// File Access: session.add_import("@my_stage/model.pkl") for static files. 
+// pandas on Snowflake (no data movement): +// import modin.pandas as pd; import snowflake.snowpark.modin.plugin +// df = pd.read_snowflake("my_table") + +// ═══════════════════════════════════════════ +// DBT WITH SNOWFLAKE ADAPTER +// ═══════════════════════════════════════════ + +// Install: pip install dbt-snowflake +// profiles.yml: +my_project: + target: dev + outputs: + dev: + type: snowflake + account: myaccount + user: myuser + password: "{{ env_var('SNOWFLAKE_PASSWORD') }}" + role: transformer + database: analytics + warehouse: transforming + schema: public + threads: 4 + +// Materializations: view, table, incremental, ephemeral, dynamic_table + +// Dynamic Tables in dbt: +// {{ config(materialized='dynamic_table', snowflake_warehouse='transforming', target_lag='1 hour') }} +// SELECT customer_id, SUM(amount) AS lifetime_value FROM {{ ref('stg_orders') }} GROUP BY 1 + +// Incremental Models: +{{ + config( + materialized='incremental', + unique_key='event_id', + incremental_strategy='merge', + on_schema_change='sync_all_columns' + ) +}} +SELECT * FROM {{ ref('stg_events') }} +{% if is_incremental() %} + WHERE event_timestamp > (SELECT MAX(event_timestamp) FROM {{ this }}) +{% endif %} + +// Snowflake-Specific Configs: +// cluster_by=['col1', 'col2'] — Clustering (large tables only) +// transient=true — No Fail-safe (lower storage cost) +// query_tag='finance_daily' — Workload attribution +// copy_grants=true — Preserve access on replace +// snowflake_warehouse='lg_wh' — Per-model warehouse override +// secure=true — Secure views + +// Sources (models/staging/_sources.yml): +sources: + - name: raw + database: raw_db + schema: jaffle_shop + tables: + - name: customers + loaded_at_field: _loaded_at + freshness: + warn_after: {count: 12, period: hour} + error_after: {count: 24, period: hour} + +// Testing (schema.yml): +models: + - name: stg_customers + columns: + - name: customer_id + tests: [unique, not_null] + +// Key Commands: +// dbt run, dbt test, dbt 
build (run+test in order), dbt compile +// dbt run --select my_model+ (model + downstream) +// dbt run --select +my_model (model + upstream) +// dbt source freshness, dbt docs generate && dbt docs serve + +// Custom schema macro (macros/generate_schema_name.sql): +{% macro generate_schema_name(custom_schema_name, node) %} + {% if custom_schema_name %}{{ custom_schema_name | trim }}{% else %}{{ target.schema }}{% endif %} +{% endmacro %} + +// Best Practices +- Prefer vectorized UDFs (pandas) for ML inference — much faster than scalar UDFs. +- Pin package versions in production UDFs and stored procedures. +- Use DataFrame API over raw SQL strings in reusable Python pipelines. +- Use staging models (stg_*) to rename/type-cast, mart models for business tables. +- Use incremental for fact tables; dynamic_table for near-real-time. +- Set on_schema_change='sync_all_columns' on incremental models. +- Use copy_grants=true to avoid permission issues. Tag models for selective execution. +- Use separate warehouses for dbt runs vs analyst queries. + +// Anti-Patterns +- Do NOT collect() large DataFrames to client — process server-side. +- Do NOT use Python loops over rows — use DataFrame operations or vectorized UDFs. +- Do NOT use {{ this }} without {% if is_incremental() %} guard. +- Do NOT set cluster_by on small tables (< 1TB). +- Do NOT use materialized='table' for everything — views are free. +- Do NOT hardcode database/schema — use {{ ref() }} and {{ source() }}. 
diff --git a/rules/snowflake-snowpark-dbt-cursorrules-prompt-file/README.md b/rules/snowflake-snowpark-dbt-cursorrules-prompt-file/README.md new file mode 100644 index 00000000..2eb5fd76 --- /dev/null +++ b/rules/snowflake-snowpark-dbt-cursorrules-prompt-file/README.md @@ -0,0 +1,16 @@ +# Snowflake Snowpark Python & dbt Cursor Rules + +Rules for building data pipelines with Snowpark Python (server-side DataFrames, UDFs, stored procedures) and dbt with the Snowflake adapter (dynamic tables, incremental models, Snowflake-specific configs). + +## Usage + +Copy the `.cursorrules` file to the root of your Snowpark or dbt-snowflake project. + +## Rules Summary + +- Snowpark Python: Session, DataFrame API, scalar and vectorized UDFs, UDTFs, stored procedures +- pandas on Snowflake (modin), third-party packages, file access in UDFs +- dbt-snowflake: profiles.yml, all materializations including dynamic_table +- Incremental models with merge strategy and schema evolution +- Snowflake-specific dbt configs (cluster_by, transient, query_tag, copy_grants) +- Sources, testing, macros, and key dbt commands From f38a28c6e20b767a300d6e6749c7070ed43daddc Mon Sep 17 00:00:00 2001 From: James Cha-Earley Date: Mon, 23 Mar 2026 10:23:04 -0700 Subject: [PATCH 2/2] Address CodeRabbit review feedback - Sort README entries alphabetically within Database and API section - Add author attribution (Snowflake DevRel) to all 3 rule README files - Replace hardcoded credentials with env vars in Snowpark session example --- README.md | 4 ++-- rules/snowflake-cortex-ai-cursorrules-prompt-file/README.md | 2 ++ .../README.md | 2 ++ .../.cursorrules | 5 ++++- .../snowflake-snowpark-dbt-cursorrules-prompt-file/README.md | 2 ++ 5 files changed, 12 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3415edfd..45091e08 100644 --- a/README.md +++ b/README.md @@ -185,10 +185,10 @@ By creating a `.cursorrules` file in your project's root directory, you can leve ### Database and API - [GraphQL 
(Apollo Client)](./rules/react-graphql-apollo-client-cursorrules-prompt-file/.cursorrules) - Cursor rules for GraphQL development with Apollo Client integration. -- [TypeScript (Axios)](./rules/typescript-axios-cursorrules-prompt-file/.cursorrules) - Cursor rules for TypeScript development with Axios integration. -- [Snowflake Data Engineering](./rules/snowflake-data-engineering-cursorrules-prompt-file/.cursorrules) - Cursor rules for Snowflake SQL, data pipelines (Dynamic Tables, Streams, Tasks, Snowpipe), semi-structured data, Snowflake Postgres, and cost optimization. - [Snowflake Cortex AI](./rules/snowflake-cortex-ai-cursorrules-prompt-file/.cursorrules) - Cursor rules for Snowflake Cortex AI Functions (AI_COMPLETE, AI_CLASSIFY, AI_EXTRACT, etc.) and Cortex Search for RAG applications. +- [Snowflake Data Engineering](./rules/snowflake-data-engineering-cursorrules-prompt-file/.cursorrules) - Cursor rules for Snowflake SQL, data pipelines (Dynamic Tables, Streams, Tasks, Snowpipe), semi-structured data, Snowflake Postgres, and cost optimization. - [Snowflake Snowpark Python & dbt](./rules/snowflake-snowpark-dbt-cursorrules-prompt-file/.cursorrules) - Cursor rules for Snowpark Python (DataFrames, UDFs, stored procedures) and dbt with the Snowflake adapter. +- [TypeScript (Axios)](./rules/typescript-axios-cursorrules-prompt-file/.cursorrules) - Cursor rules for TypeScript development with Axios integration. 
### Testing diff --git a/rules/snowflake-cortex-ai-cursorrules-prompt-file/README.md b/rules/snowflake-cortex-ai-cursorrules-prompt-file/README.md index 07cb1b4f..484d75ca 100644 --- a/rules/snowflake-cortex-ai-cursorrules-prompt-file/README.md +++ b/rules/snowflake-cortex-ai-cursorrules-prompt-file/README.md @@ -1,5 +1,7 @@ # Snowflake Cortex AI Cursor Rules +Author: [Snowflake DevRel](https://github.com/Snowflake-Labs) + Rules for Snowflake Cortex — AI Functions (AI_COMPLETE, AI_CLASSIFY, AI_EXTRACT, AI_EMBED, and more) and Cortex Search (managed hybrid vector+keyword search for RAG applications). ## Usage diff --git a/rules/snowflake-data-engineering-cursorrules-prompt-file/README.md b/rules/snowflake-data-engineering-cursorrules-prompt-file/README.md index caa3244b..f7c20559 100644 --- a/rules/snowflake-data-engineering-cursorrules-prompt-file/README.md +++ b/rules/snowflake-data-engineering-cursorrules-prompt-file/README.md @@ -1,5 +1,7 @@ # Snowflake Data Engineering Cursor Rules +Author: [Snowflake DevRel](https://github.com/Snowflake-Labs) + Rules for comprehensive data engineering on Snowflake — SQL best practices, data pipelines (Dynamic Tables, Streams, Tasks, Snowpipe), semi-structured data handling, Snowflake Postgres, Time Travel, and cost optimization. 
## Usage diff --git a/rules/snowflake-snowpark-dbt-cursorrules-prompt-file/.cursorrules b/rules/snowflake-snowpark-dbt-cursorrules-prompt-file/.cursorrules index eec8c13d..c5cca3c9 100644 --- a/rules/snowflake-snowpark-dbt-cursorrules-prompt-file/.cursorrules +++ b/rules/snowflake-snowpark-dbt-cursorrules-prompt-file/.cursorrules @@ -12,8 +12,11 @@ You are an expert in Snowpark Python (Snowflake's server-side Python API) and db // Session from snowflake.snowpark import Session +import os session = Session.builder.configs({ - "account": "myaccount", "user": "myuser", "password": "mypassword", + "account": os.environ["SNOWFLAKE_ACCOUNT"], + "user": os.environ["SNOWFLAKE_USER"], + "password": os.environ["SNOWFLAKE_PASSWORD"], "role": "my_role", "warehouse": "my_wh", "database": "my_db", "schema": "my_schema" }).create() diff --git a/rules/snowflake-snowpark-dbt-cursorrules-prompt-file/README.md b/rules/snowflake-snowpark-dbt-cursorrules-prompt-file/README.md index 2eb5fd76..ea609464 100644 --- a/rules/snowflake-snowpark-dbt-cursorrules-prompt-file/README.md +++ b/rules/snowflake-snowpark-dbt-cursorrules-prompt-file/README.md @@ -1,5 +1,7 @@ # Snowflake Snowpark Python & dbt Cursor Rules +Author: [Snowflake DevRel](https://github.com/Snowflake-Labs) + Rules for building data pipelines with Snowpark Python (server-side DataFrames, UDFs, stored procedures) and dbt with the Snowflake adapter (dynamic tables, incremental models, Snowflake-specific configs). ## Usage