diff --git a/docker/compose-controller-spark-sql-external-storage.yaml b/docker/compose-controller-spark-sql-external-storage.yaml
new file mode 100644
index 000000000..bdc14448f
--- /dev/null
+++ b/docker/compose-controller-spark-sql-external-storage.yaml
@@ -0,0 +1,109 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This docker-compose configuration brings up a pipeline controller along
+# with a single-process Spark environment that has a JDBC endpoint.
+
+# Environment variables:
+#
+# PIPELINE_CONFIG: The directory that contains the pipeline configurations,
+# namely the application.yaml and flink-conf.yaml files.
+#
+# DWH_ROOT: The directory where Parquet files are written. This is shared
+# between all containers; the pipeline writes to it and the Spark containers
+# read from it.
+#
+# Note that if local paths are used, they should start with `./` or `../`.
+# Also, the mounted files should be readable by the containers, e.g.,
+# world-readable.
+#
+
+# NOTES ON SPARK:
+# This is a very simple single-process Spark configuration for running SQL
+# queries against the Parquet files generated by the pipeline. It exposes an
+# endpoint on port 10001 which can be used for JDBC connections from any SQL
+# client.
+#
+# For a more complete configuration that shows the different pieces needed in
+# a cluster environment, please see `compose-controller-spark-sql.yaml`.
+
+# NOTES ON METASTORE:
+# This configuration uses the default embedded Derby database as the Metastore
+# for the thriftserver. Example config lines are provided (but commented out)
+# that show how to use an external DB instead.
+
+# OTHER CONFIGS:
+# If you want to change Spark default configs, you can mount your config files
+# to /opt/bitnami/spark/conf/
+# https://spark.apache.org/docs/latest/configuration.html
+
+version: '2'
+
+services:
+  drivers-build:
+    container_name: drivers-build
+    build:
+      context: ./drivers-build
+    command:
+      # Copies the drivers from the drivers-build folder to the jdbcDrivers
+      # volume, to be used by the Spark containers.
+      - /bin/sh
+      - -ec
+      - |-
+        cp -R /jdbcDrivers/* /drivers-build/jdbcDrivers
+    volumes:
+      - jdbcDrivers:/drivers-build/jdbcDrivers
+
+  pipeline-controller:
+    # To force a build, use the `--build` option of `docker-compose up`.
+    build:
+      context: ..
+    container_name: pipeline-controller
+    volumes:
+      - ${PIPELINE_CONFIG}:/app/config:ro
+      - ${DWH_ROOT}:/dwh
+    ports:
+      - '8090:8080'
+
+  spark:
+    image: docker.io/bitnami/spark:3.3
+    container_name: spark-thriftserver
+    command:
+      # Copies the drivers to the jars directory before the thrift server
+      # starts.
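+      # Jars placed in /opt/bitnami/spark/jars/ are on Spark's default
+      # classpath, so the thriftserver picks the JDBC driver up without any
+      # further configuration.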
+      - /bin/bash
+      - -ec
+      - |-
+        cp -R /drivers-build/jdbcDrivers/* /opt/bitnami/spark/jars/
+        sbin/start-thriftserver.sh
+    environment:
+      - HIVE_SERVER2_THRIFT_PORT=10000
+    ports:
+      - '10001:10000'
+      - '4041:4040'
+    volumes:
+      - ${DWH_ROOT}:/dwh
+      - ./hive-site_example.xml:/opt/bitnami/spark/conf/hive-site.xml
+    volumes_from:
+      - drivers-build
+
+  postgres:
+    image: postgres:14
+    ports:
+      - "5470:5432"
+    environment:
+      - "POSTGRES_PASSWORD=admin"
+      - "POSTGRES_USER=admin"
+      - "POSTGRES_DB=custom_metastore_db"
+    volumes:
+      - pgdata:/var/lib/postgresql/data
+
+volumes:
+  jdbcDrivers:
+  pgdata:
diff --git a/docker/drivers-build/Dockerfile b/docker/drivers-build/Dockerfile
new file mode 100644
index 000000000..5d02ba0d7
--- /dev/null
+++ b/docker/drivers-build/Dockerfile
@@ -0,0 +1,12 @@
+FROM alpine:3.17.3
+
+WORKDIR /jdbcDrivers
+
+ARG POSTGRESQL_DRIVER_VERSION=42.6.0
+
+# Install required packages
+RUN apk update && apk add curl
+
+# Fetch drivers
+RUN curl -s https://jdbc.postgresql.org/download/postgresql-$POSTGRESQL_DRIVER_VERSION.jar \
+    -o postgresql-$POSTGRESQL_DRIVER_VERSION.jar
diff --git a/docker/drivers-build/README.md b/docker/drivers-build/README.md
new file mode 100644
index 000000000..a87e70180
--- /dev/null
+++ b/docker/drivers-build/README.md
@@ -0,0 +1,6 @@
+# Overview
+
+The docker/drivers-build directory contains a sample Dockerfile for packaging the PostgreSQL JDBC driver, to be used by the spark-thriftserver for external storage. See the demonstration [docker compose file](../compose-controller-spark-sql-external-storage.yaml).
+
+
+> This folder should be ignored during continuous integration tests.
diff --git a/docker/hive-site_example.xml b/docker/hive-site_example.xml
index 7fb57f6c7..ad93e0d25 100644
--- a/docker/hive-site_example.xml
+++ b/docker/hive-site_example.xml
@@ -17,7 +17,8 @@
 https://cwiki.apache.org/confluence/display/Hive/Configuration+Properties#Config
-      jdbc:postgresql://172.18.0.1:5432/custom_metastore_db
+
+      jdbc:postgresql://postgres:5432/custom_metastore_db
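Usage sketch (illustrative; the choice of client is an assumption, not
something the compose file prescribes): once the stack is up, the
thriftserver's JDBC endpoint can be exercised with any HiveServer2-compatible
SQL client, e.g. beeline, if installed on the host:

  # The '10001:10000' port mapping above exposes the thriftserver on
  # localhost:10001; listing the registered tables serves as a smoke test.
  beeline -u jdbc:hive2://localhost:10001 -e 'SHOW TABLES;'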