57 changes: 57 additions & 0 deletions connectors/redshift_multithreading/connector.py
@@ -0,0 +1,57 @@
"""
Redshift Multithreading connector for Fivetran Connector SDK.
Demonstrates threaded extraction from Amazon Redshift.
"""

import threading
import psycopg2
from fivetran_connector_sdk import connector, config, state, records, log, schema

Copilot AI Nov 20, 2025

Import of 'log' is not used.

Suggested change
from fivetran_connector_sdk import connector, config, state, records, log, schema
from fivetran_connector_sdk import connector, config, state, records, schema

CONFIG = config.Config(
    host=config.StringField(),
    port=config.IntegerField(default=5439),
    database=config.StringField(),
    user=config.StringField(),
    password=config.SecretField(),
    threads=config.IntegerField(default=4)
)

SCHEMA = schema.Schema(
    name="redshift_table",
    columns={
        "id": schema.StringColumn(),
        "data": schema.JSONColumn(),
    }
)

@connector(
    name="RedshiftMultithreadingConnector",
    version="0.1.0",
    config=CONFIG,
    schema=SCHEMA,
)
def run_connector(ctx: state.Context):
    def worker(offset):
        conn = psycopg2.connect(
            host=ctx.config.host,
            dbname=ctx.config.database,
            user=ctx.config.user,
            password=ctx.config.password,
            port=ctx.config.port,
        )
        cur = conn.cursor()
        cur.execute(f"SELECT * FROM some_table LIMIT 100 OFFSET {offset}")

Copilot AI Nov 20, 2025

SQL injection vulnerability: The offset parameter is directly interpolated into the SQL query using an f-string without sanitization. While offset comes from controlled code in this example, this pattern is unsafe and violates security best practices.

Use parameterized queries instead:

cur.execute("SELECT * FROM some_table LIMIT %s OFFSET %s", (100, offset))
Suggested change
        cur.execute(f"SELECT * FROM some_table LIMIT 100 OFFSET {offset}")
        cur.execute("SELECT * FROM some_table LIMIT %s OFFSET %s", (100, offset))

Copilot AI Nov 20, 2025

The hardcoded table name some_table should be configurable. Users of this connector would need to modify the source code to work with their actual table names. This should be a configuration parameter instead:

# In configuration
table_name=config.StringField()

# In query
cur.execute(f"SELECT * FROM {configuration['table_name']} LIMIT 100 OFFSET {offset}")

Or better yet, since a table name cannot be bound as a query parameter, compose the identifier safely (for example with psycopg2's sql.Identifier) and validate it against the expected table names to prevent SQL injection.
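
A minimal sketch of that safer composition, assuming a hypothetical table_name configuration key (not part of the config defined in this file) and using psycopg2's sql module so the identifier is quoted rather than interpolated:

from psycopg2 import sql  # psycopg2's safe SQL composition helpers

def build_batch_query(table_name: str) -> sql.Composed:
    # sql.Identifier quotes the table name as a SQL identifier, so it cannot
    # inject arbitrary SQL; LIMIT and OFFSET remain bound parameters.
    return sql.SQL("SELECT * FROM {} LIMIT %s OFFSET %s").format(sql.Identifier(table_name))

# Hypothetical usage with the assumed table_name key:
# cur.execute(build_batch_query(configuration["table_name"]), (100, offset))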

Copilot AI Nov 20, 2025

The pagination logic with fixed LIMIT/OFFSET is inefficient and can lead to data inconsistency issues:

  1. Performance: OFFSET becomes slower as the offset increases because the database must scan and skip all previous rows
  2. Data consistency: If rows are inserted/updated between thread executions, some data may be missed or duplicated
  3. Not incremental: This approach always reads all data, not just new/updated records

For proper incremental syncs, use:

  • A replication key (e.g., updated_at timestamp or auto-incrementing ID)
  • WHERE clause filtering: WHERE updated_at > last_synced_timestamp ORDER BY updated_at
  • Track progress in state: state["last_updated_at"]

See connectors/redshift/simple_redshift_connector/connector.py for a proper incremental sync pattern.
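
A minimal sketch of that incremental pattern, assuming the source table exposes an updated_at column (not part of the example schema above):

def fetch_incremental(cur, state: dict):
    # Resume from the last checkpointed timestamp; fall back to the epoch on the first sync.
    last_synced = state.get("last_updated_at", "1970-01-01 00:00:00")
    cur.execute(
        "SELECT id, data, updated_at FROM some_table WHERE updated_at > %s ORDER BY updated_at",
        (last_synced,),
    )
    for row in cur.fetchall():
        # Remember the newest timestamp seen so it can be checkpointed after the batch.
        state["last_updated_at"] = str(row[2])
        yield row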

        for row in cur.fetchall():

Copilot AI Nov 20, 2025

Memory safety violation: fetchall() loads the entire result set into memory, which can exhaust available memory with large datasets.

Use server-side cursors with fetchmany(batch_size) instead:

__BATCH_SIZE = 1000
cursor = connection.cursor(name='server_side_cursor')
cursor.execute(query)
while True:
    rows = cursor.fetchmany(__BATCH_SIZE)
    if not rows:
        break
    for row in rows:
        op.upsert(table, row_to_dict(row))
    op.checkpoint(state)

            records.write("redshift_table", {"id": row[0], "data": row})
        conn.close()
Comment on lines +35 to +46

Copilot AI Nov 20, 2025

Missing error handling and retry logic. Database queries can fail due to transient network issues, connection timeouts, or temporary database unavailability. There is no try-except block or retry mechanism with exponential backoff.

Example with retry logic:

import time

__MAX_RETRIES = 5

for attempt in range(__MAX_RETRIES):
    try:
        conn = psycopg2.connect(...)
        cursor = conn.cursor()
        cursor.execute(query)
        break
    except (psycopg2.OperationalError, psycopg2.InterfaceError) as e:
        if attempt == __MAX_RETRIES - 1:
            raise
        sleep_time = min(60, 2 ** attempt)
        log.warning(f"Retry {attempt + 1}/{__MAX_RETRIES} after {sleep_time}s: {e}")
        time.sleep(sleep_time)

Comment on lines +34 to +46

Copilot AI Nov 20, 2025

Missing logging. The SDK requires using from fivetran_connector_sdk import Logging as log and logging important events such as:

  • Connection establishment: log.info(f"Connected to Redshift at {host}:{port}")
  • Progress updates: log.info(f"Processing batch {batch_num}")
  • Retry attempts: log.warning(f"Retry {attempt}/{max_retries} due to: {error}")
  • Errors: log.severe(f"Failed to fetch data: {error}")

Never use Python's built-in logging module or print() statements.
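
For illustration, the worker in this example could log along these lines (a sketch only; the surrounding fetch logic is elided and the configuration argument is assumed):

from fivetran_connector_sdk import Logging as log  # SDK logging, as required above

def worker(offset, configuration: dict):
    host, port = configuration["host"], configuration["port"]
    log.info(f"Connecting to Redshift at {host}:{port}")
    try:
        # ... connect, fetch the batch, and hand rows back to the main thread ...
        log.info(f"Finished batch starting at offset {offset}")
    except Exception as e:
        log.severe(f"Failed to fetch data at offset {offset}: {e}")
        raise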

Comment on lines +35 to +46

Copilot AI Nov 20, 2025

Resource leak: Database connections are not properly closed in case of exceptions. If an error occurs during query execution or data processing, the connection remains open.

Use context managers or try-finally blocks:

conn = None
try:
    conn = psycopg2.connect(...)
    cur = conn.cursor()
    # ... process data
finally:
    if conn:
        conn.close()

Or better, use psycopg2's context manager:

with psycopg2.connect(...) as conn:
    with conn.cursor() as cur:
        # ... process data
Suggested change
        conn = psycopg2.connect(
            host=ctx.config.host,
            dbname=ctx.config.database,
            user=ctx.config.user,
            password=ctx.config.password,
            port=ctx.config.port,
        )
        cur = conn.cursor()
        cur.execute(f"SELECT * FROM some_table LIMIT 100 OFFSET {offset}")
        for row in cur.fetchall():
            records.write("redshift_table", {"id": row[0], "data": row})
        conn.close()
        # Use context managers to ensure the connection and cursor are always closed, even if an exception occurs.
        with psycopg2.connect(
            host=ctx.config.host,
            dbname=ctx.config.database,
            user=ctx.config.user,
            password=ctx.config.password,
            port=ctx.config.port,
        ) as conn:
            with conn.cursor() as cur:
                cur.execute(f"SELECT * FROM some_table LIMIT 100 OFFSET {offset}")
                for row in cur.fetchall():
                    records.write("redshift_table", {"id": row[0], "data": row})

    threads = []
    for i in range(ctx.config.threads):
        t = threading.Thread(target=worker, args=(i * 100,))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()
Comment on lines +34 to +55

Copilot AI Nov 20, 2025

Incorrect multithreading implementation. According to SDK multithreading guidelines (see connectors/oauth2_and_accelo_api_connector_multithreading_enabled/api_threading_utils.py):

Critical rules violated:

  1. Never call SDK operations inside threads: records.write() (or, in the correct SDK API, op.upsert()) should NOT be called from worker threads
  2. Only API/data fetching should be threaded: Threads should only fetch data and return it; all SDK operations must happen in the main thread
  3. Thread-safe state management: State must use threading.local() or be synchronized

Correct pattern:

from concurrent.futures import ThreadPoolExecutor

def fetch_batch(offset):
    # Only fetch data in thread
    conn = psycopg2.connect(...)
    cursor = conn.cursor()
    cursor.execute(query, (offset,))
    rows = cursor.fetchmany(batch_size)
    conn.close()
    return rows

# In main thread
with ThreadPoolExecutor(max_workers=threads) as executor:
    futures = [executor.submit(fetch_batch, i * 100) for i in range(num_batches)]
    for future in futures:
        rows = future.result()
        for row in rows:
            op.upsert(table, row)  # SDK operations in main thread only
        op.checkpoint(state)

    return ctx.update_state({"last_sync": "now"})
Comment on lines +6 to +57

Copilot AI Nov 20, 2025

This connector uses an incorrect API pattern for the Fivetran Connector SDK. The SDK does not support decorator-based connectors with @connector, config.Config(), or schema.Schema() patterns.

The correct pattern requires:

  1. Import: from fivetran_connector_sdk import Connector, Operations as op, Logging as log
  2. Define update(configuration: dict, state: dict) function
  3. Define schema(configuration: dict) function
  4. Initialize: connector = Connector(update=update, schema=schema)

Please refer to the template at template_example_connector/connector.py or existing examples like connectors/redshift/simple_redshift_connector/connector.py for the correct structure.

Suggested change
import threading
import psycopg2
from fivetran_connector_sdk import connector, config, state, records, log, schema
CONFIG = config.Config(
    host=config.StringField(),
    port=config.IntegerField(default=5439),
    database=config.StringField(),
    user=config.StringField(),
    password=config.SecretField(),
    threads=config.IntegerField(default=4)
)
SCHEMA = schema.Schema(
    name="redshift_table",
    columns={
        "id": schema.StringColumn(),
        "data": schema.JSONColumn(),
    }
)
@connector(
    name="RedshiftMultithreadingConnector",
    version="0.1.0",
    config=CONFIG,
    schema=SCHEMA,
)
def run_connector(ctx: state.Context):
    def worker(offset):
        conn = psycopg2.connect(
            host=ctx.config.host,
            dbname=ctx.config.database,
            user=ctx.config.user,
            password=ctx.config.password,
            port=ctx.config.port,
        )
        cur = conn.cursor()
        cur.execute(f"SELECT * FROM some_table LIMIT 100 OFFSET {offset}")
        for row in cur.fetchall():
            records.write("redshift_table", {"id": row[0], "data": row})
        conn.close()
    threads = []
    for i in range(ctx.config.threads):
        t = threading.Thread(target=worker, args=(i * 100,))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    return ctx.update_state({"last_sync": "now"})
import threading  # For multithreaded data extraction
import psycopg2  # For connecting to Amazon Redshift
import json  # For reading configuration from JSON file
from fivetran_connector_sdk import Connector  # For connector initialization
from fivetran_connector_sdk import Logging as log  # For logging
from fivetran_connector_sdk import Operations as op  # For upsert and checkpoint operations

# Constants for configuration keys and batch size
__BATCH_SIZE = 100  # Number of records per thread
__TABLE_NAME = "redshift_table"
__MAX_THREADS = 16  # Maximum allowed threads


def validate_configuration(configuration: dict):
    """
    Validate the configuration dictionary to ensure it contains all required parameters.
    This function is called at the start of the update method to ensure that the connector has all necessary configuration values.
    Args:
        configuration: a dictionary that holds the configuration settings for the connector.
    Raises:
        ValueError: if any required configuration parameter is missing or invalid.
    """
    required_configs = ["host", "port", "database", "user", "password", "threads"]
    for key in required_configs:
        if key not in configuration:
            raise ValueError(f"Missing required configuration value: {key}")
    if not isinstance(configuration["port"], int) or not (0 < configuration["port"] < 65536):
        raise ValueError("Port must be a valid integer between 1 and 65535.")
    if not isinstance(configuration["threads"], int) or not (1 <= configuration["threads"] <= __MAX_THREADS):
        raise ValueError(f"Threads must be an integer between 1 and {__MAX_THREADS}.")
    # Additional validation can be added here as needed


def schema(configuration: dict):
    """
    Define the schema function which lets you configure the schema your connector delivers.
    See the technical reference documentation for more details on the schema function:
    https://fivetran.com/docs/connectors/connector-sdk/technical-reference#schema
    Args:
        configuration: a dictionary that holds the configuration settings for the connector.
    """
    return [
        {
            "table": __TABLE_NAME,
            "primary_key": ["id"],
            "columns": {
                "id": "STRING",
                "data": "JSON"
            }
        }
    ]


def update(configuration: dict, state: dict):
    """
    Define the update function which lets you configure how your connector fetches data.
    See the technical reference documentation for more details on the update function:
    https://fivetran.com/docs/connectors/connector-sdk/technical-reference#update
    Args:
        configuration: a dictionary that holds the configuration settings for the connector.
        state: a dictionary that holds the state of the connector.
    """
    log.warning("Example: DATABASE : Redshift Multithreading")
    validate_configuration(configuration)

    # Use threading to fetch data in parallel from Redshift
    threads = []
    thread_errors = []
    results_lock = threading.Lock()

    def worker(offset):
        """
        Worker function to fetch a batch of records from Redshift.
        Args:
            offset: The offset for the SQL query LIMIT/OFFSET.
        """
        try:
            conn = psycopg2.connect(
                host=configuration["host"],
                dbname=configuration["database"],
                user=configuration["user"],
                password=configuration["password"],
                port=configuration["port"],
            )
            cur = conn.cursor()
            # Fetch a batch of records using LIMIT and OFFSET
            cur.execute(f"SELECT * FROM some_table LIMIT {__BATCH_SIZE} OFFSET {offset}")
            rows = cur.fetchall()
            for row in rows:
                # The 'upsert' operation is used to insert or update data in the destination table.
                # The first argument is the name of the destination table.
                # The second argument is a dictionary containing the record to be upserted.
                op.upsert(
                    table=__TABLE_NAME,
                    data={"id": str(row[0]), "data": row}
                )
            conn.close()
        except Exception as e:
            with results_lock:
                thread_errors.append(str(e))
            log.severe(f"Thread failed with error: {e}")

    num_threads = configuration["threads"]
    for i in range(num_threads):
        t = threading.Thread(target=worker, args=(i * __BATCH_SIZE,))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    if thread_errors:
        raise RuntimeError(f"One or more threads failed: {thread_errors}")

    # Save the progress by checkpointing the state. This is important for ensuring that the sync process can resume
    # from the correct position in case of next sync or interruptions.
    # Learn more about how and where to checkpoint by reading our best practices documentation
    # (https://fivetran.com/docs/connectors/connector-sdk/best-practices#largedatasetrecommendation).
    state["last_sync"] = "now"
    op.checkpoint(state)


# Create the connector object using the schema and update functions
connector = Connector(update=update, schema=schema)

# Check if the script is being run as the main module.
# This is Python's standard entry method allowing your script to be run directly from the command line or IDE 'run' button.
# This is useful for debugging while you write your code. Note this method is not called by Fivetran when executing your connector in production.
# Please test using the Fivetran debug command prior to finalizing and deploying your connector.
if __name__ == "__main__":
    # Open the configuration.json file and load its contents
    with open("configuration.json", "r") as f:
        configuration = json.load(f)

    # Test the connector locally
    connector.debug(configuration=configuration)

Copilot AI Nov 20, 2025

Missing checkpointing logic. State is only updated once at the end via ctx.update_state({"last_sync": "now"}), which means:

  1. If the connector fails mid-sync, all progress is lost
  2. The state value "now" is not meaningful for incremental syncs
  3. No progress tracking during the sync

The SDK requires checkpointing at appropriate intervals (e.g., after processing each batch) using op.checkpoint(state) with meaningful state values like last processed record ID or timestamp.
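
A sketch of per-batch checkpointing in the main thread, assuming batches of rows are produced by the fetch logic and using a hypothetical last_offset key as the progress marker:

from fivetran_connector_sdk import Operations as op  # upsert and checkpoint operations

def sync_batches(batches, state: dict, batch_size: int = 100):
    for batch_num, rows in enumerate(batches):
        for row in rows:
            op.upsert(table="redshift_table", data={"id": str(row[0]), "data": row})
        # Checkpoint after every batch so a failed sync can resume from here.
        state["last_offset"] = (batch_num + 1) * batch_size
        op.checkpoint(state)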

Comment on lines +1 to +57

Copilot AI Nov 20, 2025

Missing requirements.txt file. This connector uses psycopg2 which is not provided by the Fivetran runtime, so it must be declared in requirements.txt with an explicit version:

psycopg2-binary==2.9.9

Note: Use psycopg2-binary instead of psycopg2 for easier installation. Never include fivetran_connector_sdk or requests in requirements.txt as they are provided by the runtime.

Comment on lines +1 to +57

Copilot AI Nov 20, 2025

Missing all required docstrings and comments. The SDK requires:

  1. Function docstrings for update() and schema() with exact format specified in coding standards
  2. Import comments explaining the purpose of each import
  3. Comments before op.upsert() explaining the upsert operation
  4. Comments before op.checkpoint() explaining checkpointing
  5. Comments at if __name__ == "__main__" block explaining the debug entry point
  6. First log statement in update(): log.warning("Example: <CATEGORY> : <EXAMPLE_NAME>")

Refer to template_example_connector/connector.py for the exact required docstring and comment formats.

Comment on lines +1 to +57

Copilot AI Nov 20, 2025

Missing required configuration.json file. Every connector must include a configuration.json file with placeholder values in the format "key": "<YOUR_KEY>".

Example for this connector:

{
    "host": "<YOUR_REDSHIFT_HOST>",
    "port": 5439,
    "database": "<YOUR_DATABASE_NAME>",
    "user": "<YOUR_USERNAME>",
    "password": "<YOUR_PASSWORD>",
    "threads": 4
}

This file is required for local testing with fivetran debug command.

Comment on lines +1 to +57

Copilot AI Nov 20, 2025

Missing required README.md file. Every connector must include a README.md following the structure in template_example_connector/README_template.md.

Required sections:

  1. Single H1 heading: # Redshift Multithreading Connector Example
  2. Connector overview
  3. Requirements
  4. Getting started
  5. Features
  6. Data handling
  7. Error handling
  8. Tables created
  9. Additional considerations

This documentation is essential for users to understand how to use and configure the connector.

Comment on lines +1 to +57

Copilot AI Nov 20, 2025

Missing the main execution block required for local testing. The connector must include:

# Create the connector object using the schema and update functions
connector = Connector(update=update, schema=schema)

# Check if the script is being run as the main module.
# This is Python's standard entry method allowing your script to be run directly from the command line or IDE 'run' button.
# This is useful for debugging while you write your code. Note this method is not called by Fivetran when executing your connector in production.
# Please test using the Fivetran debug command prior to finalizing and deploying your connector.
if __name__ == "__main__":
    # Open the configuration.json file and load its contents
    with open("configuration.json", "r") as f:
        configuration = json.load(f)
    
    # Test the connector locally
    connector.debug(configuration=configuration)

This is required for testing with the fivetran debug command.

Comment on lines +1 to +57

Copilot AI Nov 20, 2025

Missing the required validate_configuration() function. This function must validate configuration values for correctness (e.g., port ranges, valid formats, required fields) and raise ValueError with descriptive error messages for invalid configurations.

Example:

def validate_configuration(configuration: dict):
    """
    Validate the configuration dictionary to ensure it contains all required parameters.
    Args:
        configuration: a dictionary that holds the configuration settings for the connector.
    Raises:
        ValueError: if any required configuration parameter is missing or invalid.
    """
    required_configs = ["host", "database", "user", "password"]
    for key in required_configs:
        if key not in configuration:
            raise ValueError(f"Missing required configuration value: {key}")
    
    port = configuration.get("port", 5439)
    if not isinstance(port, int) or port < 1 or port > 65535:
        raise ValueError(f"Invalid port number: {port}. Must be between 1 and 65535.")

This function should be called at the start of the update() function.
