diff --git a/docs/operations_/backup_restore/00_overview.md b/docs/operations_/backup_restore/00_overview.md
new file mode 100644
index 00000000000..7cd121b8651
--- /dev/null
+++ b/docs/operations_/backup_restore/00_overview.md
@@ -0,0 +1,305 @@
+---
+description: 'Overview of ClickHouse backup and restore'
+sidebar_label: 'Overview'
+slug: /operations/backup/overview
+title: 'Backup and Restore in ClickHouse'
+doc_type: 'reference'
+---
+
+import GenericSettings from '@site/docs/operations_/backup_restore/_snippets/_generic_settings.md';
+import Syntax from '@site/docs/operations_/backup_restore/_snippets/_syntax.md';
+import AzureSettings from '@site/docs/operations_/backup_restore/_snippets/_azure_settings.md';
+import S3Settings from '@site/docs/operations_/backup_restore/_snippets/_s3_settings.md';
+
+> This section broadly covers backups and restores in ClickHouse. For a more
+detailed description of each backup method, see the pages for specific methods
+in the sidebar.
+
+## Introduction {#introduction}
+
+While [replication](/engines/table-engines/mergetree-family/replication) provides protection from hardware failures, it does not
+protect against human errors: accidental deletion of data, deletion of the wrong
+table or a table on the wrong cluster, and software bugs that result in incorrect
+data processing or data corruption.
+
+In many cases mistakes like these will affect all replicas. ClickHouse has built-in
+safeguards to prevent some types of mistakes. For example, by [default](/operations/settings/settings#max_table_size_to_drop)
+you can't just drop tables with a `MergeTree` family engine containing more than
+50 GB of data. However, these safeguards do not cover all possible cases, and
+problems can still occur.
+
+To effectively mitigate possible human errors, you should carefully prepare a
+strategy for backing up and restoring your data **in advance**.
+
+Each company has different resources available and business requirements, so
+there's no universal solution for ClickHouse backups and restores that will fit
+every situation. What works for one gigabyte of data likely won't work for tens
+of petabytes of data. There are a variety of possible approaches, each with its own
+pros and cons, which are presented in this section of the docs. It is a good idea to
+use several approaches rather than just one, so as to compensate for their various
+shortcomings.
+
+:::note
+Keep in mind that if you backed something up and never tried to restore it,
+chances are that the restore will not work properly when you actually need it (or at
+least it will take longer than the business can tolerate). So whatever backup
+approach you choose, make sure to automate the restore process as well, and practice
+it on a spare ClickHouse cluster regularly.
+::: + +The following pages detail the various backup and +restore methods available in ClickHouse: + +| Page | Description | +|---------------------------------------------------------------------|-----------------------------------------------------------| +| [Backup/restore using local disk or S3 disk](./01_local_disk.md) | Details backup/restore to or from a local disk or S3 disk | +| [Backup/restore using S3 endpoint](./02_s3_endpoint.md) | Details backup/restore to or from an S3 endpoint | +| [Backup/restore using AzureBlobStorage](./03_azure_blob_storage.md) | Details backup/restore to or from Azure blob storage | +| [Alternative methods](./04_alternative_methods.md) | Discusses alternative backup methods | + +Backups can: +- be [full or incremental](#backup-types) +- be [synchronous or asynchronous](#synchronous-vs-asynchronous) +- be [concurrent or non-concurrent](#concurrent-vs-non-concurrent) +- be [compressed or uncompressed](#compressed-vs-uncompressed) +- use [named collections](#using-named-collections) +- be password protected +- be taken of [system tables, log tables, or access management tables](#system-backups) + +## Backup types {#backup-types} + +Backups can be either full or incremental. Full backups are a complete copy of the +data, while incremental backups are a delta of the data from the last full backup. + +Full backups have the advantage of being a simple, independent (of other backups) +and reliable recovery method. However, they can take a long time to complete and +can consume a lot of space. Incremental backups, on the other hand, are more +efficient in terms of both time and space, but restoring the data requires all +the backups to be available. + +Depending on your needs, you may want to use: +- **Full backups** for smaller databases or critical data. +- **Incremental backups** for larger databases or when backups need to be done frequently and cost effectively. +- **Both**, for instance, weekly full backups and daily incremental backups. + +## Synchronous vs asynchronous backups {#synchronous-vs-asynchronous} + +`BACKUP` and `RESTORE` commands can also be marked `ASYNC`. In this case, the +backup command returns immediately, and the backup process runs in the background. +If the commands are not marked `ASYNC`, the backup process is synchronous and +the command blocks until the backup completes. + +## Concurrent vs non-concurrent backups {#concurrent-vs-non-concurrent} + +By default, ClickHouse allows concurrent backups and restores. This means you +can initiate multiple backup or restore operations simultaneously. However, +there are server-level settings that let you disallow this behavior. If you set +these settings to false, only one backup or restore operation is allowed to run +on a cluster at a time. This can help avoid resource contention or potential +conflicts between operations. + +To disallow concurrent backup/restore, you can use these settings respectively: + +```xml + + + false + false + + +``` + +The default value for both is true, so by default concurrent backup/restores are +allowed. When these settings are false on a cluster, only a single backup/restore +is allowed to run on a cluster at a time. + +## Compressed vs uncompressed backups {#compressed-vs-uncompressed} + +ClickHouse backups support compression through the `compression_method` and `compression_level` settings. 
+ +When creating a backup, you can specify: + +```sql +BACKUP TABLE test.table + TO Disk('backups', 'filename.zip') + SETTINGS compression_method='lzma', compression_level=3 +``` + +## Using named collections {#using-named-collections} + +Named collections allow you to store key-value pairs (like S3 credentials, endpoints, and settings) that can be reused across backup/restore operations. +They help to: + +- Hide credentials from users without admin access +- Simplify commands by storing complex configuration centrally +- Maintain consistency across operations +- Avoid credential exposure in query logs + +See ["named collections"](/operations/named-collections) for further details. + +## Backing up system, log or access management tables {#system-backups} + +System tables can also be included in your backup and restore workflows, but their +inclusion depends on your specific use case. + +System tables that store historic data, such as those with a `_log` suffix (e.g., +`query_log`, `part_log`), can be backed up and restored like any other table. +If your use case relies on analyzing historic data - for example, using `query_log` +to track query performance or debug issues - it's recommended to include these +tables in your backup strategy. However, if historic data from these tables is +not required, they can be excluded to save backup storage space. + +System tables related to access management, such as users, roles, row_policies, +settings_profiles, and quotas, receive special treatment during backup and restore operations. +When these tables are included in a backup, their content is exported to a special +`accessXX.txt` file, which encapsulates the equivalent SQL statements for creating +and configuring the access entities. Upon restoration, the restore process +interprets these files and re-applies the SQL commands to recreate the users, +roles, and other configurations. This feature ensures that the access control +configuration of a ClickHouse cluster can be backed up and restored as part of +the cluster's overall setup. + +This functionality only works for configurations managed through SQL commands +(referred to as ["SQL-driven Access Control and Account Management"](/operations/access-rights#enabling-access-control)). +Access configurations defined in ClickHouse server configuration files (e.g. `users.xml`) +are not included in backups and cannot be restored through this method. 
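+
+For example, a history table such as `system.query_log` can be backed up in the
+same way as any ordinary table. The sketch below assumes query logging is enabled
+and that a disk named `backups` has already been configured as a backup destination:
+
+```sql
+-- A sketch: include the query history table in a backup
+-- (the disk name and backup file name here are illustrative)
+BACKUP TABLE system.query_log TO Disk('backups', 'query_log_backup.zip')
+```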
+ +## General syntax {#syntax} + + + +### Command summary {#command-summary} + +Each of the commands above is detailed below: + +| **Command** | **Description** | +|------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------| +| `BACKUP` | Creates a backup of specified objects | +| `RESTORE` | Restores objects from a backup | +| `[ASYNC]` | Makes the operation run asynchronously (returns immediately with an ID you can monitor) | +| `TABLE [db.]table_name [AS [db.]table_name_in_backup]` | Backs up/restores a specific table (can be renamed) | +| `[PARTITION[S] partition_expr [,...]]` | Only backup/restore specific partitions of the table | +| `DICTIONARY [db.]dictionary_name [AS [db.]name_in_backup]` | Backs up/restores a dictionary object | +| `DATABASE database_name [AS database_name_in_backup]` | Backs up/restores an entire database (can be renamed) | +| `TEMPORARY TABLE table_name [AS table_name_in_backup]` | Backs up/restores a temporary table (can be renamed) | +| `VIEW view_name [AS view_name_in_backup]` | Backs up/restores a view (can be renamed) | +| `[EXCEPT TABLES ...]` | Exclude specific tables when backing up a database | +| `ALL` | Backs up/restores everything (all databases, tables, etc.). Prior to version 23.4 of ClickHouse, `ALL` was only applicable to the `RESTORE` command. | +| `[EXCEPT {TABLES\|DATABASES}...]` | Exclude specific tables or databases when using `ALL` | +| `[ON CLUSTER 'cluster_name']` | Execute the backup/restore across a ClickHouse cluster | +| `TO\|FROM` | Direction: `TO` for backup destination, `FROM` for restore source | +| `File('/')` | Store to/restore from local file system | +| `Disk('', '/')` | Store to/restore from a configured disk | +| `S3('/', '', '')` | Store to/restore from Amazon S3 or S3-compatible storage | +| `[SETTINGS ...]` | See below for complete list of settings | | + +### Settings {#settings} + +**Generic backup/restore settings** + + + +**S3 specific settings** + + + +**Azure specific settings** + + + +## Administration and troubleshooting {#check-the-status-of-backups} + +The backup command returns an `id` and `status`, and that `id` can be used to +get the status of the backup. This is very useful to check the progress of long +`ASYNC` backups. The example below shows a failure that happened when trying to +overwrite an existing backup file: + +```sql +BACKUP TABLE helloworld.my_first_table TO Disk('backups', '1.zip') ASYNC +``` + +```response +┌─id───────────────────────────────────┬─status──────────┐ +│ 7678b0b3-f519-4e6e-811f-5a0781a4eb52 │ CREATING_BACKUP │ +└──────────────────────────────────────┴─────────────────┘ + +1 row in set. Elapsed: 0.001 sec. +``` + +```sql +SELECT +* +FROM system.backups +WHERE id='7678b0b3-f519-4e6e-811f-5a0781a4eb52' +FORMAT Vertical +``` + +```response +Row 1: +────── +id: 7678b0b3-f519-4e6e-811f-5a0781a4eb52 +name: Disk('backups', '1.zip') +#highlight-next-line +status: BACKUP_FAILED +num_files: 0 +uncompressed_size: 0 +compressed_size: 0 +#highlight-next-line +error: Code: 598. DB::Exception: Backup Disk('backups', '1.zip') already exists. (BACKUP_ALREADY_EXISTS) (version 22.8.2.11 (official build)) +start_time: 2022-08-30 09:21:46 +end_time: 2022-08-30 09:21:46 + +1 row in set. Elapsed: 0.002 sec. 
+``` + +Along with the [`system.backups`](/operations/system-tables/backups) table, all backup and restore operations are also tracked in the system log table +[`system.backup_log`](/operations/system-tables/backup_log): + +```sql +SELECT * +FROM system.backup_log +WHERE id = '7678b0b3-f519-4e6e-811f-5a0781a4eb52' +ORDER BY event_time_microseconds ASC +FORMAT Vertical +``` + +```response +Row 1: +────── +event_date: 2023-08-18 +event_time_microseconds: 2023-08-18 11:13:43.097414 +id: 7678b0b3-f519-4e6e-811f-5a0781a4eb52 +name: Disk('backups', '1.zip') +status: CREATING_BACKUP +error: +start_time: 2023-08-18 11:13:43 +end_time: 1970-01-01 03:00:00 +num_files: 0 +total_size: 0 +num_entries: 0 +uncompressed_size: 0 +compressed_size: 0 +files_read: 0 +bytes_read: 0 + +Row 2: +────── +event_date: 2023-08-18 +event_time_microseconds: 2023-08-18 11:13:43.174782 +id: 7678b0b3-f519-4e6e-811f-5a0781a4eb52 +name: Disk('backups', '1.zip') +status: BACKUP_FAILED +#highlight-next-line +error: Code: 598. DB::Exception: Backup Disk('backups', '1.zip') already exists. (BACKUP_ALREADY_EXISTS) (version 23.8.1.1) +start_time: 2023-08-18 11:13:43 +end_time: 2023-08-18 11:13:43 +num_files: 0 +total_size: 0 +num_entries: 0 +uncompressed_size: 0 +compressed_size: 0 +files_read: 0 +bytes_read: 0 + +2 rows in set. Elapsed: 0.075 sec. +``` \ No newline at end of file diff --git a/docs/operations_/backup_restore/01_local_disk.md b/docs/operations_/backup_restore/01_local_disk.md new file mode 100644 index 00000000000..ca844cf9d7e --- /dev/null +++ b/docs/operations_/backup_restore/01_local_disk.md @@ -0,0 +1,337 @@ +--- +description: 'Details backup/restore to or from a local disk' +sidebar_label: 'Local disk / S3 disk' +slug: /operations/backup/disk +title: 'Backup and Restore in ClickHouse' +doc_type: 'guide' +--- + +import GenericSettings from '@site/docs/operations_/backup_restore/_snippets/_generic_settings.md'; +import S3Settings from '@site/docs/operations_/backup_restore/_snippets/_s3_settings.md'; +import ExampleSetup from '@site/docs/operations_/backup_restore/_snippets/_example_setup.md'; +import Syntax from '@site/docs/operations_/backup_restore/_snippets/_syntax.md'; + +# BACKUP / RESTORE to disk {#backup-to-a-local-disk} + +## Syntax {#syntax} + + + +## Configure backup destinations for disk {#configure-backup-destinations-for-disk} + +### Configure a backup destination for local disk {#configure-a-backup-destination} + +In the examples below you will see the backup destination specified as `Disk('backups', '1.zip')`. +To use the `Disk` backup engine it is necessary to first add a file specifying +the backup destination at the path below: + +```text +/etc/clickhouse-server/config.d/backup_disk.xml +``` + +For example, the configuration below defines a disk named `backups` and then adds that disk to +the **allowed_disk** list of **backups**: + +```xml + + + + + + local + /backups/ + + + + + + backups + /backups/ + + + +``` + +### Configure a backup destination for S3 disk {#backuprestore-using-an-s3-disk} + +It is also possible to `BACKUP`/`RESTORE` to S3 by configuring an S3 disk in the +ClickHouse storage configuration. Configure the disk like this by adding a file to +`/etc/clickhouse-server/config.d` as was done above for the local disk. + +```xml + + + + + s3_plain + + + + + + + + +
+ s3_plain +
+
+
+
+
+ + + s3_plain + +
+``` + +`BACKUP`/`RESTORE` for S3 disk is done in the same way as for local disk: + +```sql +BACKUP TABLE data TO Disk('s3_plain', 'cloud_backup'); +RESTORE TABLE data AS data_restored FROM Disk('s3_plain', 'cloud_backup'); +``` + +:::note +- This disk should not be used for `MergeTree` itself, only for `BACKUP`/`RESTORE` +- If your tables are backed by S3 storage and the types of the disks are different, +it doesn't use `CopyObject` calls to copy parts to the destination bucket, instead, +it downloads and uploads them, which is very inefficient. In this case prefer using +the `BACKUP ... TO S3()` syntax for this use-case. +::: + +## Usage examples of backup/restore to local disk {#usage-examples} + +### Backup and restore a table {#backup-and-restore-a-table} + + + +To backup the table you can run: + +```sql title="Query" +BACKUP TABLE test_db.test_table TO Disk('backups', '1.zip') +``` + +```response title="Response" + ┌─id───────────────────────────────────┬─status─────────┐ +1. │ 065a8baf-9db7-4393-9c3f-ba04d1e76bcd │ BACKUP_CREATED │ + └──────────────────────────────────────┴────────────────┘ +``` + +The table can be restored from the backup using the following command if the table is empty: + +```sql title="Query" +RESTORE TABLE test_db.test_table FROM Disk('backups', '1.zip') +``` + +```response title="Response" + ┌─id───────────────────────────────────┬─status───┐ +1. │ f29c753f-a7f2-4118-898e-0e4600cd2797 │ RESTORED │ + └──────────────────────────────────────┴──────────┘ +``` + +:::note +The above `RESTORE` would fail if the table `test.table` contains data. +The setting `allow_non_empty_tables=true` allows `RESTORE TABLE` to insert data +into non-empty tables. This will mix earlier data in the table with the data extracted from the backup. +This setting can therefore cause data duplication in the table, and should be used with caution. +::: + +To restore the table with data already in it, run: + +```sql +RESTORE TABLE test_db.table_table FROM Disk('backups', '1.zip') +SETTINGS allow_non_empty_tables=true +``` + +Tables can be restored, or backed up, with new names: + +```sql +RESTORE TABLE test_db.table_table AS test_db.test_table_renamed FROM Disk('backups', '1.zip') +``` + +The backup archive for this backup has the following structure: + +```text +├── .backup +└── metadata + └── test_db + └── test_table.sql +``` + + + +Formats other than zip can be used. See ["Backups as tar archives"](#backups-as-tar-archives) +below for further details. + +### Incremental backups to disk {#incremental-backups} + +A base backup in ClickHouse is the initial, full backup from which the following +incremental backups are created. Incremental backups only store the changes +made since the base backup, so the base backup must be kept available to +restore from any incremental backup. The base backup destination can be set with setting +`base_backup`. + +:::note +Incremental backups depend on the base backup. The base backup must be kept available +to be able to restore from an incremental backup. 
+::: + +To make an incremental backup of a table, first make a base backup: + +```sql +BACKUP TABLE test_db.test_table TO Disk('backups', 'd.zip') +``` + +```sql +BACKUP TABLE test_db.test_table TO Disk('backups', 'incremental-a.zip') +SETTINGS base_backup = Disk('backups', 'd.zip') +``` + +All data from the incremental backup and the base backup can be restored into a +new table `test_db.test_table2` with command: + +```sql +RESTORE TABLE test_db.test_table AS test_db.test_table2 +FROM Disk('backups', 'incremental-a.zip'); +``` + +### Securing a backup {#assign-a-password-to-the-backup} + +Backups written to disk can have a password applied to the file. +The password can be specified using the `password` setting: + +```sql +BACKUP TABLE test_db.test_table +TO Disk('backups', 'password-protected.zip') +SETTINGS password='qwerty' +``` + +To restore a password-protected backup, the password must again +be specified using the `password` setting: + +```sql +RESTORE TABLE test_db.test_table +FROM Disk('backups', 'password-protected.zip') +SETTINGS password='qwerty' +``` + +### Backups as tar archives {#backups-as-tar-archives} + +Backups can be stored not only as zip archives, but also as tar archives. +The functionality is the same as for zip, except that password protection is not +supported for tar archives. Additionally, tar archives support a variety of +compression methods. + +To make a backup of a table as a tar: + +```sql +BACKUP TABLE test_db.test_table TO Disk('backups', '1.tar') +``` + +to restore from a tar archive: + +```sql +RESTORE TABLE test_db.test_table FROM Disk('backups', '1.tar') +``` + +To change the compression method, the correct file suffix should be appended to +the backup name. For example, to compress the tar archive using gzip run: + +```sql +BACKUP TABLE test_db.test_table TO Disk('backups', '1.tar.gz') +``` + +The supported compression file suffixes are: +- `tar.gz` +- `.tgz` +- `tar.bz2` +- `tar.lzma` +- `.tar.zst` +- `.tzst` +- `.tar.xz` + +### Compression settings {#compression-settings} + +The compression method and level of compression can be specified using +setting `compression_method` and `compression_level` respectively. + + + +```sql +BACKUP TABLE test_db.test_table +TO Disk('backups', 'filename.zip') +SETTINGS compression_method='lzma', compression_level=3 +``` + +### Restore specific partitions {#restore-specific-partitions} + +If specific partitions associated with a table need to be restored, these can be specified. + +Let's create a simple partitioned table into four parts, insert some data into it and then +take a backup of only the first and fourth partitions: + +
+ +Setup + +```sql +CREATE IF NOT EXISTS test_db; + +-- Create a partitioend table +CREATE TABLE test_db.partitioned ( + id UInt32, + data String, + partition_key UInt8 +) ENGINE = MergeTree() +PARTITION BY partition_key +ORDER BY id; + +INSERT INTO test_db.partitioned VALUES +(1, 'data1', 1), +(2, 'data2', 2), +(3, 'data3', 3), +(4, 'data4', 4); + +SELECT count() FROM test_db.partitioned; + +SELECT partition_key, count() +FROM test_db.partitioned +GROUP BY partition_key +ORDER BY partition_key; +``` + +```response + ┌─count()─┐ +1. │ 4 │ + └─────────┘ + ┌─partition_key─┬─count()─┐ +1. │ 1 │ 1 │ +2. │ 2 │ 1 │ +3. │ 3 │ 1 │ +4. │ 4 │ 1 │ + └───────────────┴─────────┘ +``` + +
+ +Run the following command to back up partitions 1 and 4: + +```sql +BACKUP TABLE test_db.partitioned PARTITIONS '1', '4' +TO Disk('backups', 'partitioned.zip') +``` + +Run the following command to restore partitions 1 and 4: + +```sql +RESTORE TABLE test_db.partitioned PARTITIONS '1', '4' +FROM Disk('backups', 'partitioned.zip') +SETTINGS allow_non_empty_tables=true +``` \ No newline at end of file diff --git a/docs/operations_/backup_restore/02_s3_endpoint.md b/docs/operations_/backup_restore/02_s3_endpoint.md new file mode 100644 index 00000000000..5c07dd6f390 --- /dev/null +++ b/docs/operations_/backup_restore/02_s3_endpoint.md @@ -0,0 +1,181 @@ +--- +description: 'Overview of ClickHouse backup and restore' +sidebar_label: 'S3 endpoint' +slug: /operations/backup/s3_endpoint +title: 'Backup and restore to/from an S3 endpoint' +doc_type: 'guide' +--- + +import Syntax from '@site/docs/operations_/backup_restore/_snippets/_syntax.md'; + +# BACKUP / RESTORE to or from an S3 endpoint {#backup-to-a-local-disk} + +This article covers backing up or restoring backups to/from an S3 bucket +via an S3 endpoint. + +## Syntax {#syntax} + + + +## Usage example {#usage-examples} + +### Incremental backup to an S3 endpoint {#incremental-backup-to-an-s3-endpoint} + +In this example, we will create a backup to an S3 endpoint and then restore from it +again. + +:::note +For an explanation of the differences between a full backup and an incremental +backup, see ["Backup types"](/operations/backup/overview/#backup-types) +::: + +You will need the following information to use this method: + +| Parameter | Example | +|-------------------|--------------------------------------------------------------| +| An S3 endpoint | `https://backup-ch-docs.s3.us-east-1.amazonaws.com/backups/` | +| Access key ID | `BKIOZLE2VYN3VXXTP9RC` | +| Secret access key | `40bwYnbqN7xU8bVePaUCh3+YEyGXu8UOMV9ANpwL` | + +:::tip +Creating an S3 bucket is covered in section ["use S3 Object Storage as a ClickHouse disk"](/integrations/data-ingestion/s3/index.md#configuring-s3-for-clickhouse-use) +::: + +The destination for a backup is specified as: + +```sql +S3('/', '', '', '') +``` +
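+
+For illustration, using the example values from the table above, a destination
+pointing at a directory named `my_backup` might look like this (the bucket,
+endpoint, and credentials shown are placeholders, not real values):
+
+```sql
+S3('https://backup-ch-docs.s3.us-east-1.amazonaws.com/backups/my_backup', 'BKIOZLE2VYN3VXXTP9RC', '40bwYnbqN7xU8bVePaUCh3+YEyGXu8UOMV9ANpwL')
+```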
+ + +#### Setup {#create-a-table} + +Create the following database and table and insert some random data into it: + +```sql +CREATE DATABASE IF NOT EXISTS test_db; +CREATE TABLE test_db.test_table +( + `key` Int, + `value` String, + `array` Array(String) +) +ENGINE = MergeTree +ORDER BY tuple() +``` + +```sql +INSERT INTO test_db.test_table SELECT * +FROM generateRandom('key Int, value String, array Array(String)') +LIMIT 1000 +``` + +#### Create a base backup {#create-a-base-initial-backup} + +Incremental backups require a _base_ backup to start from. The first parameter of +the S3 destination is the S3 endpoint followed by the directory within the bucket +to use for this backup. In this example the directory is named `my_backup`. + +Run the following command to create the base backup: + +```sql +BACKUP TABLE test_db.test_table TO S3( +'https://backup-ch-docs.s3.us-east-1.amazonaws.com/backups/base_backup', +'', +'' +) +``` + +```response +┌─id───────────────────────────────────┬─status─────────┐ +│ de442b75-a66c-4a3c-a193-f76f278c70f3 │ BACKUP_CREATED │ +└──────────────────────────────────────┴────────────────┘ +``` + +#### Add more data {#add-more-data} + +Incremental backups are populated with the difference between the base backup and +the current content of the table being backed up. Add more data before taking the +incremental backup: + +```sql +INSERT INTO test_db.test_table SELECT * +FROM generateRandom('key Int, value String, array Array(String)') +LIMIT 100 +``` + +#### Take an incremental backup {#take-an-incremental-backup} + +This backup command is similar to the base backup, but adds `SETTINGS base_backup` and the location of the base backup. Note that the destination for the incremental backup is not the same directory as the base, it is the same endpoint with a different target directory within the bucket. The base backup is in `my_backup`, and the incremental will be written to `my_incremental`: + +```sql +BACKUP TABLE test_db.test_table TO S3( +'https://backup-ch-docs.s3.us-east-1.amazonaws.com/backups/incremental_backup', +'', +'' +) +SETTINGS base_backup = S3( +'https://backup-ch-docs.s3.us-east-1.amazonaws.com/backups/base_backup', +'', +'' +) +``` + +```response +┌─id───────────────────────────────────┬─status─────────┐ +│ f6cd3900-850f-41c9-94f1-0c4df33ea528 │ BACKUP_CREATED │ +└──────────────────────────────────────┴────────────────┘ +``` + +#### Restore from the incremental backup {#restore-from-the-incremental-backup} + +This command restores the incremental backup into a new table, `test_table_restored`. +Note that when an incremental backup is restored, the base backup is also included. +Specify only the **incremental backup** when restoring: + +```sql +RESTORE TABLE data AS test_db.test_table_restored FROM S3( +'https://backup-ch-docs.s3.us-east-1.amazonaws.com/backups/incremental_backup', +'', +'' +) +``` + +```response +┌─id───────────────────────────────────┬─status───┐ +│ ff0c8c39-7dff-4324-a241-000796de11ca │ RESTORED │ +└──────────────────────────────────────┴──────────┘ +``` + +#### Verify the count {#verify-the-count} + +There were two inserts into the original table `data`, one with 1,000 rows and one with 100 rows, for a total of 1,100. 
+Verify that the restored table has 1,100 rows: + +```sql +SELECT count() +FROM test_db.test_table_restored +``` + +```response +┌─count()─┐ +│ 1100 │ +└─────────┘ +``` + +#### Verify the content {#verify-the-content} + +This compares the content of the original table, `test_table` with the restored table `test_table_restored`: + +```sql +SELECT throwIf(( + SELECT groupArray(tuple(*)) + FROM test_db.test_table + ) != ( + SELECT groupArray(tuple(*)) + FROM test_db.test_table_restored +), 'Data does not match after BACKUP/RESTORE') +``` + + diff --git a/docs/operations_/backup_restore/03_azure_blob_storage.md b/docs/operations_/backup_restore/03_azure_blob_storage.md new file mode 100644 index 00000000000..19d0ebc410f --- /dev/null +++ b/docs/operations_/backup_restore/03_azure_blob_storage.md @@ -0,0 +1,37 @@ +--- +description: 'Details backup/restore to or from an Azure Blob Storage endpoint' +sidebar_label: 'AzureBlobStorage' +slug: /operations/backup/azure +title: 'Backup and restore to/from Azure Blob Storage' +doc_type: 'guide' +--- + +import Syntax from '@site/docs/operations_/backup_restore/_snippets/_syntax.md'; + +# BACKUP/RESTORE to or from Azure Blob Storage {#backup-to-azure-blob-storage} + +## Syntax {#syntax} + + + +## Configuring BACKUP / RESTORE to use an AzureBlobStorage endpoint {#configuring-backuprestore-to-use-an-azureblobstorage-endpoint} + +To write backups to an AzureBlobStorage container you need the following pieces of information: +- AzureBlobStorage endpoint connection string / url, +- Container, +- Path, +- Account name (if url is specified) +- Account Key (if url is specified) + +The destination for a backup will be specified as: + +```sql +AzureBlobStorage('/', '', '', '', '') +``` + +```sql +BACKUP TABLE data TO AzureBlobStorage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1/;', + 'testcontainer', 'data_backup'); +RESTORE TABLE data AS data_restored FROM AzureBlobStorage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1/;', + 'testcontainer', 'data_backup'); +``` diff --git a/docs/operations_/backup_restore/04_alternative_methods.md b/docs/operations_/backup_restore/04_alternative_methods.md new file mode 100644 index 00000000000..3175c4fde61 --- /dev/null +++ b/docs/operations_/backup_restore/04_alternative_methods.md @@ -0,0 +1,54 @@ +--- +description: 'Details alternative backup or restore methods' +sidebar_label: 'Alternative methods' +slug: /operations/backup/alternative_methods +title: 'Alternative backup or restore methods' +doc_type: 'reference' +--- + +# Alternative backup methods + +ClickHouse stores data on disk, and there are many ways to back up disks. +These are some alternatives that have been used in the past, and that may fit +your use case. + +### Duplicating source data somewhere else {#duplicating-source-data-somewhere-else} + +Often data ingested into ClickHouse is delivered through some sort of persistent +queue, such as [Apache Kafka](https://kafka.apache.org). In this case, it is possible to configure an +additional set of subscribers that will read the same data stream while it is +being written to ClickHouse and store it in cold storage somewhere. 
Most companies +already have some default recommended cold storage, which could be an object store +or a distributed filesystem like [HDFS](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html). + +### Filesystem Snapshots {#filesystem-snapshots} + +Some local filesystems provide snapshot functionality (for example, [ZFS](https://en.wikipedia.org/wiki/ZFS)), +but they might not be the best choice for serving live queries. A possible solution +is to create additional replicas with this kind of filesystem and exclude them +from the [Distributed](/engines/table-engines/special/distributed) tables that are used for `SELECT` queries. +Snapshots on such replicas will be out of reach of any queries that modify data. +As a bonus, these replicas might have special hardware configurations with more +disks attached per server, which would be cost-effective. + +For smaller volumes of data, a simple `INSERT INTO ... SELECT ...` to remote tables +might work as well. + +### Manipulations with Parts {#manipulations-with-parts} + +ClickHouse allows using the `ALTER TABLE ... FREEZE PARTITION ...` query to create +a local copy of table partitions. This is implemented using hardlinks to the `/var/lib/clickhouse/shadow/` +folder, so it usually does not consume extra disk space for old data. The created +copies of files are not handled by ClickHouse server, so you can just leave them there: +you will have a simple backup that does not require any additional external system, +but it will still be prone to hardware issues. For this reason, it's better to +remotely copy them to another location and then remove the local copies. +Distributed filesystems and object stores are still a good options for this, +but normal attached file servers with a large enough capacity might work as well +(in this case the transfer will occur via the network filesystem or maybe [rsync](https://en.wikipedia.org/wiki/Rsync)). +Data can be restored from backup using the `ALTER TABLE ... ATTACH PARTITION ...` + +For more information about queries related to partition manipulations, see the +[`ALTER` documentation](/sql-reference/statements/alter/partition). + +A third-party tool is available to automate this approach: [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup). diff --git a/docs/operations_/backup_restore/_snippets/_azure_settings.md b/docs/operations_/backup_restore/_snippets/_azure_settings.md new file mode 100644 index 00000000000..a8c305593a6 --- /dev/null +++ b/docs/operations_/backup_restore/_snippets/_azure_settings.md @@ -0,0 +1,3 @@ +| Setting | Description | Default value | +|-------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------| +| `azure_attempt_to_create_container` | When using Azure Blob Storage, whether to attempt creating the specified container if it doesn't exist. 
| `true`| | | \ No newline at end of file diff --git a/docs/operations_/backup_restore/_snippets/_example_setup.md b/docs/operations_/backup_restore/_snippets/_example_setup.md new file mode 100644 index 00000000000..7e5ffd21188 --- /dev/null +++ b/docs/operations_/backup_restore/_snippets/_example_setup.md @@ -0,0 +1,74 @@ +Run the following commands below to create the test database and table we will be +making a backup and restoration of in this example: + +
+Setup commands + +Create the database and table: + +```sql +CREATE DATABASE test_db; + +CREATE TABLE test_db.test_table ( + id UUID, + name String, + email String, + age UInt8, + salary UInt32, + created_at DateTime, + is_active UInt8, + department String, + score Float32, + country String +) ENGINE = MergeTree() +ORDER BY id; +``` + +Preprocess and one thousand rows of random data: + +```sql +INSERT INTO test_table (id, name, email, age, salary, created_at, is_active, department, score, country) +SELECT + generateUUIDv4() as id, + concat('User_', toString(rand() % 10000)) as name, + concat('user', toString(rand() % 10000), '@example.com') as email, + 18 + (rand() % 65) as age, + 30000 + (rand() % 100000) as salary, + now() - toIntervalSecond(rand() % 31536000) as created_at, + rand() % 2 as is_active, + arrayElement(['Engineering', 'Marketing', 'Sales', 'HR', 'Finance', 'Operations'], (rand() % 6) + 1) as department, + rand() / 4294967295.0 * 100 as score, + arrayElement(['USA', 'UK', 'Germany', 'France', 'Canada', 'Australia', 'Japan', 'Brazil'], (rand() % 8) + 1) as country +FROM numbers(1000); +``` + +Next you will need to create a file specifying the backup destination at the +path below: + +```text +/etc/clickhouse-server/config.d/backup_disk.xml +``` + +```xml + + + + + local + /backups/ -- for MacOS choose: /Users/backups/ + + + + + backups + /backups/ -- for MacOS choose: /Users/backups/ + + +``` + +:::note +If clickhouse-server is running you will need to restart it for the changes to +take effect. +::: + +
\ No newline at end of file diff --git a/docs/operations_/backup_restore/_snippets/_generic_settings.md b/docs/operations_/backup_restore/_snippets/_generic_settings.md new file mode 100644 index 00000000000..e2e41696ee1 --- /dev/null +++ b/docs/operations_/backup_restore/_snippets/_generic_settings.md @@ -0,0 +1,31 @@ +| Setting | Description | Default value | +|--------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------| +| `id` | ID of backup or restore operation, randomly generated UUID is used if not specified. If there's already a running operation with the same ID, an exception is thrown. | | +| `compression_method` | Specifies the compression method for the backup. See section ["column compression codecs"](/sql-reference/statements/create/table#column_compression_codec) | | +| `compression_level` | Specifies the compression level for the backup | | +| `password` | Password for the file on disk. | | +| `base_backup` | The destination of the base backup used for incremental backups. For example: `Disk('backups', '1.zip')` | | +| `use_same_password_for_base_backup` | Whether base backup archive should inherit the password from the query. | | +| `structure_only` | If enabled, only backs up or restores the CREATE statements without the actual table data. | | +| `storage_policy` | Storage policy for the tables being restored. See ["using multiple block devices for data storage](/engines/table-engines/mergetree-family/mergetree#table_engine-mergetree-multiple-volumes). Only applicable to the `RESTORE` command. Applies only to tables with an engine from the `MergeTree` family. | | +| `allow_non_empty_tables` | Allows `RESTORE TABLE` to insert data into non-empty tables. This will mix earlier data in the table with the data extracted from the backup. This setting can therefore cause data duplication in the table, use with caution. | `0` | +| `backup_restore_keeper_max_retries` | Max retries for [Zoo]Keeper operations in the middle of a BACKUP or RESTORE operation. Should be big enough so the whole operation won't fail because of a temporary [Zoo]Keeper failure. | `1000` | +| `backup_restore_keeper_retry_initial_backoff_ms` | Initial backoff timeout for [Zoo]Keeper operations during backup or restore | `100` | +| `backup_restore_keeper_retry_max_backoff_ms` | Max backoff timeout for [Zoo]Keeper operations during backup or restore | `5000` | +| `backup_restore_failure_after_host_disconnected_for_seconds` | If a host during a `BACKUP ON CLUSTER` or `RESTORE ON CLUSTER` operation doesn't recreate its ephemeral 'alive' node in ZooKeeper for this amount of time then the whole backup or restore is considered as failed. This value should be bigger than any reasonable time for a host to reconnect to ZooKeeper after a failure. Zero means unlimited. | `3600` | +| `backup_restore_keeper_max_retries_while_initializing` | Max retries for [Zoo]Keeper operations during the initialization of a `BACKUP ON CLUSTER` or `RESTORE ON CLUSTER` operation. | `20` | +| `backup_restore_keeper_max_retries_while_handling_error` | Max retries for [Zoo]Keeper operations while handling an error of a `BACKUP ON CLUSTER` or `RESTORE ON CLUSTER` operation. 
| `20` | +| `backup_restore_finish_timeout_after_error_sec` | How long the initiator should wait for other host to react to the 'error' node and stop their work on the current `BACKUP ON CLUSTER` or `RESTORE ON CLUSTER` operation. | `180` | +| `backup_restore_keeper_value_max_size` | Maximum size of data of a [Zoo]Keeper's node during backup | `1048576` | +| `backup_restore_batch_size_for_keeper_multi` | Maximum size of batch for multi request to [Zoo]Keeper during backup or restore | `1000` | +| `backup_restore_batch_size_for_keeper_multiread` | Maximum size of batch for multiread request to [Zoo]Keeper during backup or restore | `10000` | +| `backup_restore_keeper_fault_injection_probability` | Approximate probability of failure for a keeper request during backup or restore. Valid value is in interval `[0.0f, 1.0f]` | `0` | +| `backup_restore_keeper_fault_injection_seed` | `0` for a random seed, otherwise the setting value | `0` | +| `backup_restore_s3_retry_attempts` | Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries. It takes place only for backup/restore. | `1000` | +| `max_backup_bandwidth` | The maximum read speed in bytes per second for particular backup on server. Zero means unlimited. | `0` | +| `max_backups_io_thread_pool_size` | ClickHouse uses threads from the Backups IO Thread pool to do S3 backup IO operations. `max_backups_io_thread_pool_size` limits the maximum number of threads in the pool. | `1000` | +| `max_backups_io_thread_pool_free_size` | If the number of **idle** threads in the Backups IO Thread pool exceeds `max_backup_io_thread_pool_free_size`, ClickHouse will release resources occupied by idling threads and decrease the pool size. Threads can be created again if necessary. | `0` | +| `backups_io_thread_pool_queue_size` | The maximum number of jobs that can be scheduled on the Backups IO Thread pool. It is recommended to keep this queue unlimited due to the current S3 backup logic. Note: A value of `0` (default) means unlimited. | `0` | +| `backup_threads` | The maximum number of threads to execute `BACKUP` requests. | | +| `max_backup_bandwidth_for_server` | The maximum read speed in bytes per second for all backups on server. Zero means unlimited. | `0` | +| `shutdown_wait_backups_and_restores` | If set to true ClickHouse will wait for running backups and restores to finish before shutdown. | `1` | \ No newline at end of file diff --git a/docs/operations_/backup_restore/_snippets/_s3_settings.md b/docs/operations_/backup_restore/_snippets/_s3_settings.md new file mode 100644 index 00000000000..c92d8bbdcdb --- /dev/null +++ b/docs/operations_/backup_restore/_snippets/_s3_settings.md @@ -0,0 +1,4 @@ +| Setting | Description | Default value | +|-----------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------| +| `use_same_s3_credentials_for_base_backup` | Whether base backup to S3 should inherit credentials from the query. Only works with `S3`. | | +| `s3_storage_class` | The storage class used for S3 backup. 
For example: `STANDARD` | | \ No newline at end of file diff --git a/docs/operations_/backup_restore/_snippets/_syntax.md b/docs/operations_/backup_restore/_snippets/_syntax.md new file mode 100644 index 00000000000..bde43a3cd3c --- /dev/null +++ b/docs/operations_/backup_restore/_snippets/_syntax.md @@ -0,0 +1,25 @@ +```sql +-- core commands +BACKUP | RESTORE [ASYNC] +--- what to backup/restore (or exclude) +TABLE [db.]table_name [AS [db.]table_name_in_backup] | +DICTIONARY [db.]dictionary_name [AS [db.]name_in_backup] | +DATABASE database_name [AS database_name_in_backup] | +TEMPORARY TABLE table_name [AS table_name_in_backup] | +VIEW view_name [AS view_name_in_backup] | +[EXCEPT TABLES ...] | +ALL [EXCEPT {TABLES|DATABASES}...] } [,...] +--- +[ON CLUSTER 'cluster_name'] +--- where to backup or restore to or from +TO|FROM +File('/') | +Disk('', '/') | +S3('/', '', '', '') | +AzureBlobStorage('/', '', '', '', '') +--- additional settings +[SETTINGS ...] +``` + +**See ["command summary"](/operations/backup/overview/#command-summary) for more details +of each command.** \ No newline at end of file diff --git a/plugins/floating-pages-exceptions.txt b/plugins/floating-pages-exceptions.txt index b68aff5676c..60f9123b3ec 100644 --- a/plugins/floating-pages-exceptions.txt +++ b/plugins/floating-pages-exceptions.txt @@ -10,3 +10,4 @@ integrations/language-clients/java/client-v1 integrations/language-clients/java/jdbc-v1 integrations/data-ingestion/clickpipes/postgres/maintenance.md +operations/backup.md diff --git a/sidebars.js b/sidebars.js index e4ca70c8456..84494802054 100644 --- a/sidebars.js +++ b/sidebars.js @@ -84,7 +84,7 @@ const sidebars = { "guides/inserting-data", "guides/writing-queries", "guides/developer/mutations", - "guides/generating-test-data" + "guides/generating-test-data" ], }, { @@ -134,7 +134,7 @@ const sidebars = { label: "Capabilities", collapsed: true, collapsible: true, - items: [ + items: [ "use-cases/observability/cloud-monitoring", "use-cases/observability/self-managed-monitoring" ] @@ -743,7 +743,7 @@ const sidebars = { items: [ "integrations/data-ingestion/kafka/index", "integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink", - "integrations/data-ingestion/kafka/confluent/confluent-cloud", + "integrations/data-ingestion/kafka/confluent/confluent-cloud", "integrations/data-ingestion/kafka/confluent/custom-connector", "integrations/data-ingestion/kafka/msk/index", "integrations/data-ingestion/kafka/kafka-vector", @@ -1301,6 +1301,19 @@ const sidebars = { "operations/ssl-zookeeper", "operations/startup-scripts", "operations/storing-data", + "operations/allocation-profiling", + { + type: "category", + label: "Backup/Restore", + collapsed: true, + collapsible: true, + items: [ + { + type: "autogenerated", + dirName: "operations_/backup_restore", + } + ] + }, { type: "category", label: "Allocation profiling", @@ -1312,7 +1325,6 @@ const sidebars = { "operations/allocation-profiling-old", ] }, - "operations/backup", "operations/caches", "operations/workload-scheduling", "operations/update", diff --git a/src/theme/CodeBlock/index.js b/src/theme/CodeBlock/index.js index 230ef8f6b2a..5d816932395 100644 --- a/src/theme/CodeBlock/index.js +++ b/src/theme/CodeBlock/index.js @@ -6,12 +6,12 @@ function countLines(text = '') { if (typeof text !== 'string') { return 1; // Default to 1 line for non-string inputs } - + const trimmedText = text.trim(); if (!trimmedText) { return 1; // Return 1 for empty or whitespace-only strings } - + const lines = trimmedText.split('\n'); 
return Math.max(1, lines.length); // Ensure at least 1 line is returned } @@ -77,9 +77,9 @@ export default function CodeBlockWrapper(props) { return (
{/* Invisible content for crawlers/SEO */} -
- + {/* Visible loading animation */}
); - } - + } + return ( <>