From 67a1ef868ed3b0c2e4e90e6c7ca3729df39952db Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Tue, 9 Dec 2025 15:00:48 +0100 Subject: [PATCH 01/14] attempt at converting duckdb to column conversions. WIP --- .../kotlinx/dataframe/io/db/DuckDb.kt | 348 ++++++++++++------ gradle/libs.versions.toml | 2 +- 2 files changed, 241 insertions(+), 109 deletions(-) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt index d6882133e7..b353a206a2 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -1,7 +1,8 @@ package org.jetbrains.kotlinx.dataframe.io.db import io.github.oshai.kotlinlogging.KotlinLogging -import org.duckdb.DuckDBColumnType +import kotlinx.datetime.LocalDate +import kotlinx.datetime.LocalTime import org.duckdb.DuckDBColumnType.ARRAY import org.duckdb.DuckDBColumnType.BIGINT import org.duckdb.DuckDBColumnType.BIT @@ -39,32 +40,45 @@ import org.duckdb.DuckDBColumnType.UUID import org.duckdb.DuckDBColumnType.VARCHAR import org.duckdb.DuckDBResultSetMetaData import org.duckdb.JsonNode +import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.asValueColumn +import org.jetbrains.kotlinx.dataframe.api.cast +import org.jetbrains.kotlinx.dataframe.api.columnOf +import org.jetbrains.kotlinx.dataframe.api.convertTo +import org.jetbrains.kotlinx.dataframe.api.convertToLocalDate +import org.jetbrains.kotlinx.dataframe.api.convertToLocalTime +import org.jetbrains.kotlinx.dataframe.api.map +import org.jetbrains.kotlinx.dataframe.api.single +import org.jetbrains.kotlinx.dataframe.columns.ValueColumn import org.jetbrains.kotlinx.dataframe.io.DbConnectionConfig import org.jetbrains.kotlinx.dataframe.io.db.DuckDb.convertSqlTypeToKType import 
org.jetbrains.kotlinx.dataframe.io.readAllSqlTables import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import java.math.BigDecimal import java.math.BigInteger -import java.sql.Array import java.sql.Blob import java.sql.Connection import java.sql.DatabaseMetaData import java.sql.DriverManager import java.sql.ResultSet import java.sql.Struct -import java.sql.Timestamp -import java.time.LocalDate -import java.time.LocalTime -import java.time.OffsetDateTime -import java.time.OffsetTime import java.util.Properties -import java.util.UUID import kotlin.reflect.KType import kotlin.reflect.KTypeProjection import kotlin.reflect.full.createType import kotlin.reflect.full.withNullability import kotlin.reflect.typeOf +import kotlin.time.Instant +import kotlin.time.toKotlinInstant +import kotlin.uuid.Uuid +import java.sql.Array as SqlArray +import java.sql.Timestamp as SqlTimestamp +import java.time.LocalDate as JavaLocalDate +import java.time.LocalTime as JavaLocalTime +import java.time.OffsetDateTime as JavaOffsetDateTime +import java.time.OffsetTime as JavaOffsetTime +import java.util.UUID as JavaUUID private val logger = KotlinLogging.logger {} @@ -80,120 +94,56 @@ public object DuckDb : DbType("duckdb") { override val driverClassName: String = "org.duckdb.DuckDBDriver" /** - * How a column type from JDBC, [tableColumnMetadata], is read in Java/Kotlin. - * The returned type must exactly follow [ResultSet.getObject] of your specific database's JDBC driver. - * Returning `null` defer the implementation to the default one (which may not always be correct). - * - * Following [org.duckdb.DuckDBVector.getObject]. + * TODO: Unclear what this returned [KType] is useful for. 
Let's remove this function and just have + * [convertSqlTypeToColumnSchemaValue] */ override fun convertSqlTypeToKType(tableColumnMetadata: TableColumnMetadata): KType = - tableColumnMetadata.sqlTypeName.toKType(tableColumnMetadata.isNullable) + convertSqlTypeToColumnSchemaValue(tableColumnMetadata).type /** * How a column from JDBC should be represented as DataFrame (value) column * See [convertSqlTypeToKType]. */ - override fun convertSqlTypeToColumnSchemaValue(tableColumnMetadata: TableColumnMetadata): ColumnSchema { - val type = convertSqlTypeToKType(tableColumnMetadata) - return ColumnSchema.Value(type) - } + override fun convertSqlTypeToColumnSchemaValue(tableColumnMetadata: TableColumnMetadata): ColumnSchema = + parseDuckDbType(tableColumnMetadata.sqlTypeName, tableColumnMetadata.isNullable).targetSchema /** - * Follows exactly [org.duckdb.DuckDBVector.getObject]. - * - * "// dataframe-jdbc" is added for all types that are covered correctly by - * [org.jetbrains.kotlinx.dataframe.io.db.DbType.makeCommonSqlToKTypeMapping] at the moment, however, to cover - * all nested types, we'll use a full type-map for all [DuckDB types][DuckDBColumnType] exactly. + * TODO: This function achieves the same goal as [convertSqlTypeToKType]. 
*/ - @Suppress("ktlint:standard:blank-line-between-when-conditions") - internal fun String.toKType(isNullable: Boolean): KType { - val sqlTypeName = this - return when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) { - BOOLEAN -> typeOf() // dataframe-jdbc - TINYINT -> typeOf() - SMALLINT -> typeOf() - INTEGER -> typeOf() // dataframe-jdbc - BIGINT -> typeOf() // dataframe-jdbc - HUGEINT -> typeOf() - UHUGEINT -> typeOf() - UTINYINT -> typeOf() - USMALLINT -> typeOf() - UINTEGER -> typeOf() - UBIGINT -> typeOf() - FLOAT -> typeOf() // dataframe-jdbc - DOUBLE -> typeOf() // dataframe-jdbc - DECIMAL -> typeOf() // dataframe-jdbc - TIME -> typeOf() - TIME_WITH_TIME_ZONE -> typeOf() // dataframe-jdbc - DATE -> typeOf() - TIMESTAMP, TIMESTAMP_MS, TIMESTAMP_NS, TIMESTAMP_S -> typeOf() // dataframe-jdbc - TIMESTAMP_WITH_TIME_ZONE -> typeOf() // dataframe-jdbc - JSON -> typeOf() - BLOB -> typeOf() - UUID -> typeOf() - MAP -> { - val (key, value) = parseMapTypes(sqlTypeName) - Map::class.createType( - listOf( - KTypeProjection.invariant(key.toKType(false)), - KTypeProjection.invariant(value.toKType(true)), - ), - ) - } - - LIST, ARRAY -> { - // TODO requires #1266 and #1273 for specific types - // val listType = parseListType(sqlTypeName) - // Array::class.createType( - // listOf(KTypeProjection.invariant(listType.toKType(true))), - // ) - typeOf() - } + override fun makeCommonSqlToKTypeMapping(tableColumnMetadata: TableColumnMetadata): Nothing = + error("This function should not be called. Or exist, for that matter...") - STRUCT -> typeOf() // TODO requires #1266 for specific types - UNION -> typeOf() // Cannot handle this in Kotlin - VARCHAR -> typeOf() - UNKNOWN, BIT, INTERVAL, ENUM -> typeOf() - }.withNullability(isNullable) + /** + * TODO: I wanted to do the conversion here, but as I have no source type ánd target type + * it's impossible. + * It would be easier to do conversion on the entire column because we can borrow [DataColumn.convertTo]. 
+ */ + override fun buildDataColumn( + name: String, + values: MutableList, + kType: KType, + inferNullability: Boolean, + ): DataColumn<*> { + val sourceType = kType + return super.buildDataColumn(name, values, kType, inferNullability) } - /** Parses "MAP(X, Y)" into "X" and "Y", taking parentheses into account */ - internal fun parseMapTypes(typeString: String): Pair { - if (!typeString.startsWith("MAP(") || !typeString.endsWith(")")) { - error("invalid MAP type: $typeString") - } - - val content = typeString.removeSurrounding("MAP(", ")") + override fun extractValueFromResultSet( + rs: ResultSet, + columnIndex: Int, + columnMetadata: TableColumnMetadata, + kType: KType, + ): Any? { + // TODO This '+ 1' is easily forgotten if I need to override this function to do any conversion + val result = rs.getObject(columnIndex + 1) - // Find the comma that separates key and value types - var parenCount = 0 - var commaIndex = -1 - for (i in content.indices) { - when (content[i]) { - '(' -> parenCount++ + // TODO: where is the [ColumnSchema] when I need it? + // Now I need to call my [parseDuckDbType] function again... 
+ val parsedType = parseDuckDbType(columnMetadata.sqlTypeName, columnMetadata.isNullable) - ')' -> parenCount-- - - ',' -> if (parenCount == 0) { - commaIndex = i - break - } - } - } - - if (commaIndex == -1) error("invalid MAP type: $typeString") - val keyType = content.take(commaIndex).trim() - val valueType = content.substring(commaIndex + 1).trim() - return Pair(keyType, valueType) - } - - /** Parses "X[]" and "X[123]" into "X", and "X[][]" into "X[]" */ - internal fun parseListType(typeString: String): String { - if (!typeString.endsWith("]")) { - error("invalid LIST/ARRAY type: $typeString") - } - - return typeString.take(typeString.indexOfLast { it == '[' }) + // TODO doing it as a column + val convertedResult = parsedType.converter(columnOf(result)).single() + return convertedResult } /** @@ -228,7 +178,7 @@ public object DuckDb : DbType("duckdb") { * but supports read-only mode through connection parameters. * * @param [dbConfig] The database configuration containing URL, credentials, and read-only flag. - * @return A configured [java.sql.Connection] instance. + * @return A configured [Connection] instance. */ override fun createConnection(dbConfig: DbConnectionConfig): Connection { val properties = Properties().apply { @@ -256,3 +206,185 @@ public object DuckDb : DbType("duckdb") { private fun String.isInMemoryDuckDb(): Boolean = this.trim() == "jdbc:duckdb:" || matches("jdbc:duckdb:\\s*$".toRegex()) } + +/** + * How a column type from JDBC, [sqlTypeName], is read in Java/Kotlin. + * The returned type must exactly follow [ResultSet.getObject] of your specific database's JDBC driver. + * Returning `null` defer the implementation to the default one (which may not always be correct). 
+ * + * Following [org.duckdb.DuckDBVector.getObject] and converting the result to + * + */ +internal fun parseDuckDbType(sqlTypeName: String, isNullable: Boolean): ParsedType = + when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) { + BOOLEAN -> parsedTypeForValueColumnOf(isNullable) + + TINYINT -> parsedTypeForValueColumnOf(isNullable) + + SMALLINT -> parsedTypeForValueColumnOf(isNullable) + + INTEGER -> parsedTypeForValueColumnOf(isNullable) + + BIGINT -> parsedTypeForValueColumnOf(isNullable) + + HUGEINT -> parsedTypeForValueColumnOf(isNullable) + + UHUGEINT -> parsedTypeForValueColumnOf(isNullable) + + UTINYINT -> parsedTypeForValueColumnOf(isNullable) + + USMALLINT -> parsedTypeForValueColumnOf(isNullable) + + UINTEGER -> parsedTypeForValueColumnOf(isNullable) + + UBIGINT -> parsedTypeForValueColumnOf(isNullable) + + FLOAT -> parsedTypeForValueColumnOf(isNullable) + + DOUBLE -> parsedTypeForValueColumnOf(isNullable) + + DECIMAL -> parsedTypeForValueColumnOf(isNullable) + + // DataFrame can do this conversion + TIME -> parsedTypeForValueColumnOf(isNullable) { it.convertTo() } + + // todo? + TIME_WITH_TIME_ZONE -> parsedTypeForValueColumnOf(isNullable) + + // DataFrame can do this conversion + DATE -> parsedTypeForValueColumnOf(isNullable) { it.convertTo() } + + TIMESTAMP, TIMESTAMP_MS, TIMESTAMP_NS, TIMESTAMP_S -> + parsedTypeForValueColumnOf(isNullable) { + it.map { + it?.toInstant()?.toKotlinInstant() + }.asValueColumn().cast() + } + + // todo? + TIMESTAMP_WITH_TIME_ZONE -> parsedTypeForValueColumnOf(isNullable) + + // TODO! 
+ JSON -> parsedTypeForValueColumnOf(isNullable) + + BLOB -> parsedTypeForValueColumnOf(isNullable) + + UUID -> parsedTypeForValueColumnOf(isNullable) { it.convertTo() } + + MAP -> { + val (key, value) = parseMapTypes(sqlTypeName) + val sourceMapType = Map::class.createType( + listOf( + KTypeProjection.invariant(parseDuckDbType(key, false).sourceType), + KTypeProjection.invariant(parseDuckDbType(value, true).sourceType), + ), + ) + val targetMapType = Map::class.createType( + listOf( + KTypeProjection.invariant(parseDuckDbType(key, false).targetSchema.type), + KTypeProjection.invariant(parseDuckDbType(value, true).targetSchema.type), + ), + ) + + ParsedType( + sourceType = sourceMapType, + targetSchema = ColumnSchema.Value(targetMapType), + converter = { it }, + ) + } + + LIST, ARRAY -> { + // TODO requires #1266 and #1273 for specific types + val listType = parseListType(sqlTypeName) + val parsedListType = parseDuckDbType(listType, true) + val targetListType = List::class.createType( + listOf(KTypeProjection.invariant(parsedListType.targetSchema.type)), + ) + // todo maybe List should become FrameColumn + ParsedType( + sourceType = typeOf(), + targetSchema = ColumnSchema.Value(targetListType), + converter = { it }, + ) + } + + // TODO requires #1266 for specific types + STRUCT -> parsedTypeForValueColumnOf(isNullable) + + // Cannot handle this in Kotlin + UNION -> parsedTypeForValueColumnOf(isNullable) + + VARCHAR -> parsedTypeForValueColumnOf(isNullable) + + UNKNOWN, BIT, INTERVAL, ENUM -> parsedTypeForValueColumnOf(isNullable) + } + +/** + * @property sourceType the source type of the column as read by [ResultSet.getObject] of our specific database's JDBC driver. + * @property targetSchema the target schema of the column. This can have a different [kType][ColumnSchema.type] than [sourceType]! + * If so, the values need to be converted in [DbType.buildDataColumn]. 
+ * @property converter a function that converts the source column to the target column type + */ +internal data class ParsedType( + val sourceType: KType, + val targetSchema: ColumnSchema, + val converter: (DataColumn<*>) -> DataColumn<*>, +) + +internal inline fun parsedTypeForValueColumnOf(isNullable: Boolean): ParsedType { + val type = typeOf().withNullability(isNullable) + return ParsedType( + sourceType = type, + targetSchema = ColumnSchema.Value(type), + converter = { it }, + ) +} + +internal inline fun parsedTypeForValueColumnOf( + isNullable: Boolean, + noinline converter: (DataColumn) -> DataColumn, +): ParsedType = + ParsedType( + sourceType = typeOf().withNullability(isNullable), + targetSchema = ColumnSchema.Value(typeOf().withNullability(isNullable)), + converter = converter as (DataColumn<*>) -> DataColumn<*>, + ) + +/** Parses "MAP(X, Y)" into "X" and "Y", taking parentheses into account */ +internal fun parseMapTypes(typeString: String): Pair { + if (!typeString.startsWith("MAP(") || !typeString.endsWith(")")) { + error("invalid MAP type: $typeString") + } + + val content = typeString.removeSurrounding("MAP(", ")") + + // Find the comma that separates key and value types + var parenCount = 0 + var commaIndex = -1 + for (i in content.indices) { + when (content[i]) { + '(' -> parenCount++ + + ')' -> parenCount-- + + ',' -> if (parenCount == 0) { + commaIndex = i + break + } + } + } + + if (commaIndex == -1) error("invalid MAP type: $typeString") + val keyType = content.take(commaIndex).trim() + val valueType = content.substring(commaIndex + 1).trim() + return Pair(keyType, valueType) +} + +/** Parses "X[]" and "X[123]" into "X", and "X[][]" into "X[]" */ +internal fun parseListType(typeString: String): String { + if (!typeString.endsWith("]")) { + error("invalid LIST/ARRAY type: $typeString") + } + + return typeString.take(typeString.indexOfLast { it == '[' }) +} diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 
6b0513fb39..91dd09576d 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -54,7 +54,7 @@ android-gradle-api = "7.3.1" # need to revise our tests to update ktor = "3.0.1" # needs jupyter compatibility with Kotlin 2.1 to update kotlin-compile-testing = "0.7.1" hikari = "7.0.2" -duckdb = "1.3.1.0" +duckdb = "1.4.2.0" buildconfig = "5.6.7" benchmark = "0.4.12" From 210af021e0bd6cfeb5ae80e654a8e66aac0d8f96 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 10 Dec 2025 16:41:15 +0100 Subject: [PATCH 02/14] Refactored DbType to use `DbColumnTypeInformation`, generated from `TableColumnMetadata` in `generateTypeInformation()`. This contains potential pre- and post-processing logic for any type --- .../io/db/DbColumnTypeInformation.kt | 133 ++++++ .../kotlinx/dataframe/io/db/DbType.kt | 428 +++++++++--------- .../jetbrains/kotlinx/dataframe/io/db/H2.kt | 10 +- .../kotlinx/dataframe/io/db/MariaDb.kt | 44 +- .../kotlinx/dataframe/io/db/MsSql.kt | 4 - .../kotlinx/dataframe/io/db/MySql.kt | 24 +- .../kotlinx/dataframe/io/db/PostgreSql.kt | 27 +- .../kotlinx/dataframe/io/db/Sqlite.kt | 4 - .../dataframe/io/readDataFrameSchema.kt | 11 +- .../kotlinx/dataframe/io/readJdbc.kt | 141 ++++-- 10 files changed, 489 insertions(+), 337 deletions(-) create mode 100644 dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt new file mode 100644 index 0000000000..d74c3364d2 --- /dev/null +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt @@ -0,0 +1,133 @@ +package org.jetbrains.kotlinx.dataframe.io.db + +import org.jetbrains.kotlinx.dataframe.DataColumn +import org.jetbrains.kotlinx.dataframe.api.cast +import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema + +public 
typealias AnyDbColumnTypeInformation = DbColumnTypeInformation<*, *, *> + +/** + * Represents all type information that can be retrieved from an SQL column. + * This can be extended for your specific [DbType2] if you need extra information. + * + * @property targetSchema the target schema of the column after running the optional + * [valuePreprocessor] and [columnPostprocessor]. + * @property valuePreprocessor an optional function that converts values from [java.sql.ResultSet.getObject] + * to a cell/row suitable to be put into a [org.jetbrains.kotlinx.dataframe.DataColumn]. + * @property columnPostprocessor an optional function that converts a [org.jetbrains.kotlinx.dataframe.DataColumn] with values of type [D] + * to a [org.jetbrains.kotlinx.dataframe.DataColumn] of with values of type [P]. + */ +public open class DbColumnTypeInformation( + public open val columnMetadata: TableColumnMetadata, + public open val targetSchema: ColumnSchema, + public open val valuePreprocessor: DbValuePreprocessor?, + public open val columnPostprocessor: DbColumnPostprocessor?, +) { + public open fun preprocess(value: J): D { + valuePreprocessor?.let { valuePreprocessor -> + return valuePreprocessor.preprocess(value, this) + } + return value as D + } + + public open fun postprocess(column: DataColumn): DataColumn

{ + columnPostprocessor?.let { columnPostprocessor -> + return columnPostprocessor.postprocess(column, this) + } + return column.cast() + } +} + +public fun DbColumnTypeInformation<*, *, *>.cast(): DbColumnTypeInformation = + this as DbColumnTypeInformation + +public fun dbColumnTypeInformation( + columnMetadata: TableColumnMetadata, + targetSchema: ColumnSchema, +): DbColumnTypeInformation = + DbColumnTypeInformation( + columnMetadata = columnMetadata, + targetSchema = targetSchema, + valuePreprocessor = null, + columnPostprocessor = null, + ) + +public fun dbColumnTypeInformationWithPreprocessing( + columnMetadata: TableColumnMetadata, + targetSchema: ColumnSchema, + valuePreprocessor: DbValuePreprocessor?, +): DbColumnTypeInformation = + DbColumnTypeInformation( + columnMetadata = columnMetadata, + targetSchema = targetSchema, + valuePreprocessor = valuePreprocessor, + columnPostprocessor = null, + ) + +public fun dbColumnTypeInformationWithPostprocessing( + columnMetadata: TableColumnMetadata, + targetSchema: ColumnSchema, + columnPostprocessor: DbColumnPostprocessor?, +): DbColumnTypeInformation = + DbColumnTypeInformation( + columnMetadata = columnMetadata, + targetSchema = targetSchema, + valuePreprocessor = null, + columnPostprocessor = columnPostprocessor, + ) + +public fun dbColumnTypeInformation( + columnMetadata: TableColumnMetadata, + targetSchema: ColumnSchema, + valuePreprocessor: DbValuePreprocessor?, + columnPostprocessor: DbColumnPostprocessor?, +): DbColumnTypeInformation = + DbColumnTypeInformation( + columnMetadata = columnMetadata, + targetSchema = targetSchema, + valuePreprocessor = valuePreprocessor, + columnPostprocessor = columnPostprocessor, + ) + +/** + * This preprocessor can be created for types where you want to convert the values + * coming from [java.sql.ResultSet.getObject] to a different type more suitable to be put in a [DataColumn] + * + * @param J the type of the value coming from the JDBC driver. 
+ * @param D the type of the column values after preprocessing. + */ +public fun interface DbValuePreprocessor { + + /** + * Converts the given [jdbcValue]: [J] to a [D]. + * + * If you intend to create a [org.jetbrains.kotlinx.dataframe.columns.ColumnGroup], + * return a [org.jetbrains.kotlinx.dataframe.DataRow] here. + * + * If you intend to create a [org.jetbrains.kotlinx.dataframe.columns.FrameColumn], + * return a [org.jetbrains.kotlinx.dataframe.DataFrame] here. + */ + public fun preprocess( + jdbcValue: J, + dbColumnTypeInformation: DbColumnTypeInformation<@UnsafeVariance J, @UnsafeVariance D, *>, + ): D +} + +public fun DbValuePreprocessor<*, *>.cast(): DbValuePreprocessor = this as DbValuePreprocessor + +/** + * @param D the type of the column values before postprocessing. + * @param P the type of the column values after postprocessing. + */ +public fun interface DbColumnPostprocessor { + + /** + * Converts the given [column]: [DataColumn] with values of type [D] to a [DataColumn] of with values of type [P]. + */ + public fun postprocess( + column: DataColumn, + dbColumnTypeInformation: DbColumnTypeInformation<*, @UnsafeVariance D, @UnsafeVariance P>, + ): DataColumn

+} + +public fun DbColumnPostprocessor<*, *>.cast(): DbColumnPostprocessor = this as DbColumnPostprocessor diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt index e4fb482d8f..baaa756228 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt @@ -1,7 +1,16 @@ package org.jetbrains.kotlinx.dataframe.io.db +import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.AnyRow import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.api.Infer +import org.jetbrains.kotlinx.dataframe.api.asDataColumn +import org.jetbrains.kotlinx.dataframe.api.asValueColumn +import org.jetbrains.kotlinx.dataframe.api.cast +import org.jetbrains.kotlinx.dataframe.api.schema +import org.jetbrains.kotlinx.dataframe.api.toColumn +import org.jetbrains.kotlinx.dataframe.api.toDataFrame +import org.jetbrains.kotlinx.dataframe.columns.ValueColumn import org.jetbrains.kotlinx.dataframe.io.DbConnectionConfig import org.jetbrains.kotlinx.dataframe.io.readAllSqlTables import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema @@ -26,12 +35,12 @@ import java.time.OffsetDateTime import java.time.OffsetTime import java.util.Date import java.util.UUID +import kotlin.collections.toTypedArray import kotlin.reflect.KClass import kotlin.reflect.KType -import kotlin.reflect.full.createType -import kotlin.reflect.full.isSupertypeOf import kotlin.reflect.full.safeCast -import kotlin.reflect.full.starProjectedType +import kotlin.reflect.full.withNullability +import kotlin.reflect.typeOf /** * The `DbType` class represents a database type used for reading dataframe from the database. 
@@ -39,6 +48,7 @@ import kotlin.reflect.full.starProjectedType * @property [dbTypeInJdbcUrl] The name of the database as specified in the JDBC URL. */ public abstract class DbType(public val dbTypeInJdbcUrl: String) { + /** * Represents the JDBC driver class name for a given database type. * @@ -82,10 +92,162 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { */ public open val defaultQueryTimeout: Int? = null // null = no timeout + /** Default mapping of [Java SQL Types][Types] to [KType]. */ + protected val defaultJdbcTypeToKTypeMapping: Map = mapOf( + Types.BIT to typeOf(), + Types.TINYINT to typeOf(), + Types.SMALLINT to typeOf(), + Types.INTEGER to typeOf(), + Types.BIGINT to typeOf(), + Types.FLOAT to typeOf(), + Types.REAL to typeOf(), + Types.DOUBLE to typeOf(), + Types.NUMERIC to typeOf(), + Types.DECIMAL to typeOf(), + Types.CHAR to typeOf(), + Types.VARCHAR to typeOf(), + Types.LONGVARCHAR to typeOf(), + Types.DATE to typeOf(), + Types.TIME to typeOf

= dbColumnTypeInformation.postprocess(column) /** * Checks if the given table name is a system table for the specified database type. @@ -103,14 +265,6 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { */ public abstract fun buildTableMetadata(tables: ResultSet): TableMetadata - /** - * Converts SQL data type to a Kotlin data type. - * - * @param [tableColumnMetadata] The metadata of the table column. - * @return The corresponding Kotlin data type, or null if no mapping is found. - */ - public abstract fun convertSqlTypeToKType(tableColumnMetadata: TableColumnMetadata): KType? - /** * Builds a SELECT query for reading from a table. * @@ -196,213 +350,6 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { return connection } - /** - * Extracts a value from the ResultSet for the given column. - * This method can be overridden by custom database types to provide specialized parsing logic. - * - * @param [rs] the ResultSet to read from - * @param [columnIndex] zero-based column index - * @param [columnMetadata] metadata for the column - * @param [kType] the Kotlin type for this column - * @return the extracted value, or null - */ - public open fun extractValueFromResultSet( - rs: ResultSet, - columnIndex: Int, - columnMetadata: TableColumnMetadata, - kType: KType, - ): Any? = - try { - rs.getObject(columnIndex + 1) - // TODO: add a special handler for Blob via Streams - } catch (_: Throwable) { - // TODO: expand for all the types like in generateKType function - if (kType.isSupertypeOf(String::class.starProjectedType)) { - rs.getString(columnIndex + 1) - } else { - rs.getString(columnIndex + 1) - } - } - - /** - * Builds a single DataColumn with proper type handling. - * Accepts a mutable list to allow efficient post-processing. 
- */ - public open fun buildDataColumn( - name: String, - values: MutableList, - kType: KType, - inferNullability: Boolean, - ): DataColumn<*> { - val correctedValues = postProcessColumnValues(values, kType) - - return DataColumn.createValueColumn( - name = name, - values = correctedValues, - infer = convertNullabilityInference(inferNullability), - type = kType, - ) - } - - private fun convertNullabilityInference(inferNullability: Boolean) = - if (inferNullability) Infer.Nulls else Infer.None - - /** - * Processes the column values retrieved from the database and performs transformations based on the provided - * Kotlin type and column metadata. The method allows for custom post-processing logic, such as handling - * specific database column types, including arrays. - * - * @param values the list of raw values retrieved from the database for the column. - * @param kType the Kotlin type that the column values should be transformed to. - * @return a list of processed column values, with transformations applied where necessary, or the original list if no transformation is needed. - */ - private fun postProcessColumnValues(values: MutableList, kType: KType): List = - when { - /* EXAMPLE: columnMetadata.sqlTypeName == "MY_CUSTOM_ARRAY" -> { - values.map { /* custom transformation */ } - } */ - kType.classifier == Array::class -> { - handleArrayValues(values) - } - - else -> values - } - - /** - * Converts SQL Array objects to strongly-typed arrays. - * - * Extracts arrays from SQL Array objects and converts them to a consistent type - * if all elements share the same type. Returns original arrays if types vary. - * - * @param values raw values containing SQL Array objects - * @return list of consistently typed arrays, or original arrays if no common type exists - */ - private fun handleArrayValues(values: MutableList): List { - // Intermediate variable for the first mapping - val sqlArrays = values.mapNotNull { - (it as? java.sql.Array)?.array?.let { array -> array as? 
Array<*> } - } - - // Flatten the arrays to iterate through all elements and filter out null values, then map to component types - val allElementTypes = sqlArrays - .flatMap { array -> - (array.javaClass.componentType?.kotlin?.let { listOf(it) } ?: emptyList()) - } // Get the component type of each array and convert it to a Kotlin class, if available - - // Find distinct types and ensure there's only one distinct type - val commonElementType = allElementTypes - .distinct() // Get unique element types - .singleOrNull() // Ensure there's only one unique element type, otherwise return null - ?: Any::class // Fallback to Any::class if multiple distinct types or no elements found - - return if (commonElementType != Any::class) { - sqlArrays.map { castArray(it, commonElementType).toTypedArray() } - } else { - sqlArrays - } - } - - /** Utility function to cast arrays based on the type of elements */ - private fun castArray(array: Array<*>, elementType: KClass): List = - array.mapNotNull { elementType.safeCast(it) } - - /** - * Creates a mapping between common SQL types and their corresponding KTypes. - * - * @param tableColumnMetadata The metadata of the table column. - * @return The KType associated with the SQL type or a default type if no mapping is found. 
- */ - public open fun makeCommonSqlToKTypeMapping(tableColumnMetadata: TableColumnMetadata): KType { - val jdbcTypeToKTypeMapping = mapOf( - Types.BIT to Boolean::class, - Types.TINYINT to Int::class, - Types.SMALLINT to Int::class, - Types.INTEGER to Int::class, - Types.BIGINT to Long::class, - Types.FLOAT to Float::class, - Types.REAL to Float::class, - Types.DOUBLE to Double::class, - Types.NUMERIC to BigDecimal::class, - Types.DECIMAL to BigDecimal::class, - Types.CHAR to String::class, - Types.VARCHAR to String::class, - Types.LONGVARCHAR to String::class, - Types.DATE to Date::class, - Types.TIME to Time::class, - Types.TIMESTAMP to Timestamp::class, - Types.BINARY to ByteArray::class, - Types.VARBINARY to ByteArray::class, - Types.LONGVARBINARY to ByteArray::class, - Types.NULL to String::class, - Types.JAVA_OBJECT to Any::class, - Types.DISTINCT to Any::class, - Types.STRUCT to Any::class, - Types.ARRAY to Array::class, - Types.BLOB to ByteArray::class, - Types.CLOB to Clob::class, - Types.REF to Ref::class, - Types.DATALINK to Any::class, - Types.BOOLEAN to Boolean::class, - Types.ROWID to RowId::class, - Types.NCHAR to String::class, - Types.NVARCHAR to String::class, - Types.LONGNVARCHAR to String::class, - Types.NCLOB to NClob::class, - Types.SQLXML to SQLXML::class, - Types.REF_CURSOR to Ref::class, - Types.TIME_WITH_TIMEZONE to OffsetTime::class, - Types.TIMESTAMP_WITH_TIMEZONE to OffsetDateTime::class, - ) - - fun determineKotlinClass(tableColumnMetadata: TableColumnMetadata): KClass<*> = - when { - tableColumnMetadata.jdbcType == Types.OTHER -> when (tableColumnMetadata.javaClassName) { - "[B" -> ByteArray::class - else -> Any::class - } - - tableColumnMetadata.javaClassName == "[B" -> ByteArray::class - - tableColumnMetadata.javaClassName == "java.sql.Blob" -> Blob::class - - tableColumnMetadata.jdbcType == Types.TIMESTAMP && - tableColumnMetadata.javaClassName == "java.time.LocalDateTime" -> LocalDateTime::class - - tableColumnMetadata.jdbcType 
== Types.BINARY && - tableColumnMetadata.javaClassName == "java.util.UUID" -> UUID::class - - tableColumnMetadata.jdbcType == Types.REAL && - tableColumnMetadata.javaClassName == "java.lang.Double" -> Double::class - - tableColumnMetadata.jdbcType == Types.FLOAT && - tableColumnMetadata.javaClassName == "java.lang.Double" -> Double::class - - tableColumnMetadata.jdbcType == Types.NUMERIC && - tableColumnMetadata.javaClassName == "java.lang.Double" -> Double::class - - // Force BIGINT to always be Long, regardless of javaClassName - // Some JDBC drivers (e.g., MariaDB) may report Integer for small BIGINT values - // TODO: tableColumnMetadata.jdbcType == Types.BIGINT -> Long::class - - else -> jdbcTypeToKTypeMapping[tableColumnMetadata.jdbcType] ?: String::class - } - - fun createArrayTypeIfNeeded(kClass: KClass<*>, isNullable: Boolean): KType = - if (kClass == Array::class) { - val typeParam = kClass.typeParameters[0].createType() - kClass.createType( - arguments = listOf(kotlin.reflect.KTypeProjection.invariant(typeParam)), - nullable = isNullable, - ) - } else { - kClass.createType(nullable = isNullable) - } - - val kClass: KClass<*> = determineKotlinClass(tableColumnMetadata) - val kType = createArrayTypeIfNeeded(kClass, tableColumnMetadata.isNullable) - return kType - } - /** * Retrieves column metadata from a JDBC ResultSet. * @@ -415,7 +362,7 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { * - `getTableName()` → extract from column name if contains '.' → `null` * - `isNullable()` → [DatabaseMetaData.getColumns] → `true` (assume nullable) * - `getColumnTypeName()` → `"OTHER"` - * - `getColumnType()` → [java.sql.Types.OTHER] + * - `getColumnType()` → [Types.OTHER] * - `getColumnDisplaySize()` → `0` * - `getColumnClassName()` → `"java.lang.Object"` * @@ -558,4 +505,43 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { return name } + + /** + * todo? + * Converts SQL Array objects to strongly-typed arrays. 
+ * + * Extracts arrays from SQL Array objects and converts them to a consistent type + * if all elements share the same type. Returns original arrays if types vary. + * + * @param values raw values containing SQL Array objects + * @return list of consistently typed arrays, or original arrays if no common type exists + */ + private fun handleArrayValues(values: ValueColumn): DataColumn { + // Intermediate variable for the first mapping + val sqlArrays = values.values().mapNotNull { + (it as? java.sql.Array)?.array?.let { array -> array as? Array<*> } + } + + // Flatten the arrays to iterate through all elements and filter out null values, then map to component types + val allElementTypes = sqlArrays + .flatMap { array -> + (array.javaClass.componentType?.kotlin?.let { listOf(it) } ?: emptyList()) + } // Get the component type of each array and convert it to a Kotlin class, if available + + // Find distinct types and ensure there's only one distinct type + val commonElementType = allElementTypes + .distinct() // Get unique element types + .singleOrNull() // Ensure there's only one unique element type, otherwise return null + ?: Any::class // Fallback to Any::class if multiple distinct types or no elements found + + return if (commonElementType != Any::class) { + sqlArrays.map { castArray(it, commonElementType).toTypedArray() } + } else { + sqlArrays + }.toColumn(values.name()) + } + + /** Utility function to cast arrays based on the type of elements */ + private fun castArray(array: Array<*>, elementType: KClass): List = + array.mapNotNull { elementType.safeCast(it) } } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/H2.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/H2.kt index 96cea43724..98f58d0350 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/H2.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/H2.kt @@ -1,9 +1,7 @@ package 
org.jetbrains.kotlinx.dataframe.io.db -import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import java.sql.ResultSet import java.util.Locale -import kotlin.reflect.KType import org.jetbrains.kotlinx.dataframe.io.db.MariaDb as MariaDbType import org.jetbrains.kotlinx.dataframe.io.db.MsSql as MsSqlType import org.jetbrains.kotlinx.dataframe.io.db.MySql as MySqlType @@ -119,8 +117,9 @@ public open class H2(public val mode: Mode = Mode.Regular) : DbType("h2") { override val driverClassName: String get() = "org.h2.Driver" - override fun convertSqlTypeToColumnSchemaValue(tableColumnMetadata: TableColumnMetadata): ColumnSchema? = - delegate?.convertSqlTypeToColumnSchemaValue(tableColumnMetadata) + override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation = + delegate?.generateTypeInformation(tableColumnMetadata) + ?: super.generateTypeInformation(tableColumnMetadata) override fun isSystemTable(tableMetadata: TableMetadata): Boolean { val locale = Locale.getDefault() @@ -146,9 +145,6 @@ public open class H2(public val mode: Mode = Mode.Regular) : DbType("h2") { tables.getString("table_cat"), ) - override fun convertSqlTypeToKType(tableColumnMetadata: TableColumnMetadata): KType? 
= - delegate?.convertSqlTypeToKType(tableColumnMetadata) - public override fun buildSqlQueryWithLimit(sqlQuery: String, limit: Int): String = delegate?.buildSqlQueryWithLimit(sqlQuery, limit) ?: super.buildSqlQueryWithLimit(sqlQuery, limit) } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt index d91b60a0b2..172c6ee2f1 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt @@ -1,11 +1,9 @@ package org.jetbrains.kotlinx.dataframe.io.db -import org.jetbrains.kotlinx.dataframe.io.db.TableColumnMetadata -import org.jetbrains.kotlinx.dataframe.io.db.TableMetadata import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import java.sql.ResultSet -import kotlin.reflect.KType -import kotlin.reflect.full.createType +import kotlin.reflect.full.withNullability +import kotlin.reflect.typeOf /** * Represents the MariaDb database type. @@ -17,7 +15,7 @@ public object MariaDb : DbType("mariadb") { override val driverClassName: String get() = "org.mariadb.jdbc.Driver" - override fun convertSqlTypeToColumnSchemaValue(tableColumnMetadata: TableColumnMetadata): ColumnSchema? 
{ + override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation { // Force BIGINT to always be Long, regardless of javaClassName // MariaDB JDBC driver may report Integer for small BIGINT values // TODO: investigate the corner case @@ -30,15 +28,21 @@ public object MariaDb : DbType("mariadb") { if (tableColumnMetadata.sqlTypeName == "INTEGER UNSIGNED" || tableColumnMetadata.sqlTypeName == "INT UNSIGNED" ) { - val kType = Long::class.createType(nullable = tableColumnMetadata.isNullable) - return ColumnSchema.Value(kType) + val kType = typeOf().withNullability(tableColumnMetadata.isNullable) + return dbColumnTypeInformation( + columnMetadata = tableColumnMetadata, + targetSchema = ColumnSchema.Value(kType), + ) } if (tableColumnMetadata.sqlTypeName == "SMALLINT" && tableColumnMetadata.javaClassName == "java.lang.Short") { - val kType = Short::class.createType(nullable = tableColumnMetadata.isNullable) - return ColumnSchema.Value(kType) + val kType = typeOf().withNullability(tableColumnMetadata.isNullable) + return dbColumnTypeInformation( + columnMetadata = tableColumnMetadata, + targetSchema = ColumnSchema.Value(kType), + ) } - return null + return super.generateTypeInformation(tableColumnMetadata) } override fun isSystemTable(tableMetadata: TableMetadata): Boolean = MySql.isSystemTable(tableMetadata) @@ -50,26 +54,6 @@ public object MariaDb : DbType("mariadb") { tables.getString("table_cat"), ) - override fun convertSqlTypeToKType(tableColumnMetadata: TableColumnMetadata): KType? 
{ - // Force BIGINT to always be Long, regardless of javaClassName - // MariaDB JDBC driver may report Integer for small BIGINT values - // TODO: investigate the corner case - // if (tableColumnMetadata.jdbcType == java.sql.Types.BIGINT) { - // return Long::class.createType(nullable = tableColumnMetadata.isNullable) - // } - - if (tableColumnMetadata.sqlTypeName == "INTEGER UNSIGNED" || - tableColumnMetadata.sqlTypeName == "INT UNSIGNED" - ) { - return Long::class.createType(nullable = tableColumnMetadata.isNullable) - } - - if (tableColumnMetadata.sqlTypeName == "SMALLINT" && tableColumnMetadata.javaClassName == "java.lang.Short") { - return Short::class.createType(nullable = tableColumnMetadata.isNullable) - } - return null - } - override fun quoteIdentifier(name: String): String { // schema.table -> `schema`.`table` return name.split(".").joinToString(".") { "`$it`" } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MsSql.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MsSql.kt index 2709e04b6a..78f3c9c5a7 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MsSql.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MsSql.kt @@ -17,8 +17,6 @@ public object MsSql : DbType("sqlserver") { override val driverClassName: String get() = "com.microsoft.sqlserver.jdbc.SQLServerDriver" - override fun convertSqlTypeToColumnSchemaValue(tableColumnMetadata: TableColumnMetadata): ColumnSchema? = null - override fun isSystemTable(tableMetadata: TableMetadata): Boolean { val locale = Locale.getDefault() @@ -47,8 +45,6 @@ public object MsSql : DbType("sqlserver") { tables.getString("table_cat"), ) - override fun convertSqlTypeToKType(tableColumnMetadata: TableColumnMetadata): KType? 
= null - public override fun buildSqlQueryWithLimit(sqlQuery: String, limit: Int): String = sqlQuery.replace("SELECT", "SELECT TOP $limit", ignoreCase = true) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt index e411345879..8f17d99a6a 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt @@ -1,12 +1,10 @@ package org.jetbrains.kotlinx.dataframe.io.db -import org.jetbrains.kotlinx.dataframe.io.db.TableColumnMetadata -import org.jetbrains.kotlinx.dataframe.io.db.TableMetadata import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import java.sql.ResultSet import java.util.Locale -import kotlin.reflect.KType -import kotlin.reflect.full.createType +import kotlin.reflect.full.withNullability +import kotlin.reflect.typeOf /** * Represents the MySql database type. @@ -18,12 +16,15 @@ public object MySql : DbType("mysql") { override val driverClassName: String get() = "com.mysql.jdbc.Driver" - override fun convertSqlTypeToColumnSchemaValue(tableColumnMetadata: TableColumnMetadata): ColumnSchema? 
{ + override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation { if (tableColumnMetadata.sqlTypeName == "INT UNSIGNED") { - val kType = Long::class.createType(nullable = tableColumnMetadata.isNullable) - return ColumnSchema.Value(kType) + val kType = typeOf().withNullability(tableColumnMetadata.isNullable) + return dbColumnTypeInformation( + columnMetadata = tableColumnMetadata, + targetSchema = ColumnSchema.Value(kType), + ) } - return null + return super.generateTypeInformation(tableColumnMetadata) } override fun isSystemTable(tableMetadata: TableMetadata): Boolean { @@ -49,13 +50,6 @@ public object MySql : DbType("mysql") { tables.getString("table_cat"), ) - override fun convertSqlTypeToKType(tableColumnMetadata: TableColumnMetadata): KType? { - if (tableColumnMetadata.sqlTypeName == "INT UNSIGNED") { - return Long::class.createType(nullable = tableColumnMetadata.isNullable) - } - return null - } - override fun quoteIdentifier(name: String): String { // schema.table -> `schema`.`table` return name.split(".").joinToString(".") { "`$it`" } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt index 8da1a66833..fa5ea022ef 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt @@ -1,12 +1,10 @@ package org.jetbrains.kotlinx.dataframe.io.db -import org.jetbrains.kotlinx.dataframe.io.db.TableColumnMetadata -import org.jetbrains.kotlinx.dataframe.io.db.TableMetadata import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import java.sql.ResultSet import java.util.Locale -import kotlin.reflect.KType -import kotlin.reflect.full.createType +import kotlin.reflect.full.withNullability +import kotlin.reflect.typeOf /** * Represents the PostgreSql database type. 
@@ -18,14 +16,16 @@ public object PostgreSql : DbType("postgresql") { override val driverClassName: String get() = "org.postgresql.Driver" - override fun convertSqlTypeToColumnSchemaValue(tableColumnMetadata: TableColumnMetadata): ColumnSchema? { - // TODO: could be a wrapper of convertSqlTypeToKType + override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation { // because of https://github.com/pgjdbc/pgjdbc/issues/425 if (tableColumnMetadata.sqlTypeName == "money") { - val kType = String::class.createType(nullable = tableColumnMetadata.isNullable) - return ColumnSchema.Value(kType) + val kType = typeOf().withNullability(tableColumnMetadata.isNullable) + return dbColumnTypeInformation( + columnMetadata = tableColumnMetadata, + targetSchema = ColumnSchema.Value(kType), + ) } - return null + return super.generateTypeInformation(tableColumnMetadata) } override fun isSystemTable(tableMetadata: TableMetadata): Boolean = @@ -39,15 +39,6 @@ public object PostgreSql : DbType("postgresql") { tables.getString("table_cat"), ) - override fun convertSqlTypeToKType(tableColumnMetadata: TableColumnMetadata): KType? 
{ - // because of https://github.com/pgjdbc/pgjdbc/issues/425 - if (tableColumnMetadata.sqlTypeName == "money") { - return String::class.createType(nullable = tableColumnMetadata.isNullable) - } - - return null - } - override fun quoteIdentifier(name: String): String { // schema.table -> "schema"."table" return name.split(".").joinToString(".") { "\"$it\"" } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/Sqlite.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/Sqlite.kt index adb03b2753..1073e12be5 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/Sqlite.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/Sqlite.kt @@ -20,8 +20,6 @@ public object Sqlite : DbType("sqlite") { override val driverClassName: String get() = "org.sqlite.JDBC" - override fun convertSqlTypeToColumnSchemaValue(tableColumnMetadata: TableColumnMetadata): ColumnSchema? = null - override fun isSystemTable(tableMetadata: TableMetadata): Boolean = tableMetadata.name.startsWith("sqlite_") override fun buildTableMetadata(tables: ResultSet): TableMetadata = @@ -31,8 +29,6 @@ public object Sqlite : DbType("sqlite") { tables.getString("TABLE_CAT"), ) - override fun convertSqlTypeToKType(tableColumnMetadata: TableColumnMetadata): KType? 
= null - override fun createConnection(dbConfig: DbConnectionConfig): Connection = if (dbConfig.readOnly) { val config = SQLiteConfig() diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readDataFrameSchema.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readDataFrameSchema.kt index acce35f9fc..4684db0567 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readDataFrameSchema.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readDataFrameSchema.kt @@ -451,14 +451,7 @@ internal fun buildSchemaByTableColumns( dbType: DbType, ): DataFrameSchema { val schemaColumns = tableColumns.associate { - Pair(it.name, generateColumnSchemaValue(dbType, it)) + it.name to dbType.generateTypeInformation(it).targetSchema } - - return DataFrameSchemaImpl( - columns = schemaColumns, - ) + return DataFrameSchemaImpl(columns = schemaColumns) } - -internal fun generateColumnSchemaValue(dbType: DbType, tableColumnMetadata: TableColumnMetadata): ColumnSchema = - dbType.convertSqlTypeToColumnSchemaValue(tableColumnMetadata) - ?: ColumnSchema.Value(dbType.makeCommonSqlToKTypeMapping(tableColumnMetadata)) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt index b565ad3d5b..e5139b06cc 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt @@ -1,19 +1,28 @@ package org.jetbrains.kotlinx.dataframe.io import io.github.oshai.kotlinlogging.KotlinLogging +import org.jetbrains.kotlinx.dataframe.AnyCol import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.isColumnGroup +import org.jetbrains.kotlinx.dataframe.api.isFrameColumn +import 
org.jetbrains.kotlinx.dataframe.api.isValueColumn +import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.api.toDataFrame +import org.jetbrains.kotlinx.dataframe.io.db.AnyDbColumnTypeInformation import org.jetbrains.kotlinx.dataframe.io.db.DbType import org.jetbrains.kotlinx.dataframe.io.db.TableColumnMetadata +import org.jetbrains.kotlinx.dataframe.io.db.cast import org.jetbrains.kotlinx.dataframe.io.db.extractDBTypeFromConnection +import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import java.sql.Connection import java.sql.DatabaseMetaData import java.sql.DriverManager import java.sql.PreparedStatement import java.sql.ResultSet import javax.sql.DataSource -import kotlin.reflect.KType +import kotlin.reflect.KClass +import kotlin.reflect.full.isSubclassOf private val logger = KotlinLogging.logger {} @@ -872,9 +881,21 @@ internal fun fetchAndConvertDataFromResultSet( limit: Int?, inferNullability: Boolean, ): AnyFrame { - val columnKTypes = buildColumnKTypes(tableColumns, dbType) - val columnData = readAllRowsFromResultSet(rs, tableColumns, columnKTypes, dbType, limit) - val dataFrame = buildDataFrameFromColumnData(columnData, tableColumns, columnKTypes, dbType, inferNullability) + val columnTypeInformation = buildColumnTypeInformation(tableColumns = tableColumns, dbType = dbType) + val columnData = readAllRowsFromResultSet( + rs = rs, + tableColumns = tableColumns, + columnTypeInformation = columnTypeInformation, + dbType = dbType, + limit = limit, + ) + val dataFrame = buildDataFrameFromColumnData( + columnData = columnData, + tableColumns = tableColumns, + columnTypeInformation = columnTypeInformation, + dbType = dbType, + inferNullability = inferNullability, + ) logger.debug { "DataFrame with ${dataFrame.rowsCount()} rows and ${dataFrame.columnsCount()} columns created as a result of SQL query." 
@@ -886,35 +907,38 @@ internal fun fetchAndConvertDataFromResultSet( /** * Builds a map of column indices to their Kotlin types. */ -private fun buildColumnKTypes(tableColumns: List, dbType: DbType): Map = - tableColumns.indices.associateWith { index -> - generateKType(dbType, tableColumns[index]) +private fun buildColumnTypeInformation( + tableColumns: List, + dbType: DbType, +): List = + tableColumns.indices.map { index -> + dbType.generateTypeInformation(tableColumns[index]) } /** * Reads all rows from ResultSet and returns a column-oriented data structure. - * Returns mutable lists to allow efficient post-processing without copying. */ private fun readAllRowsFromResultSet( rs: ResultSet, tableColumns: List, - columnKTypes: Map, + columnTypeInformation: List, dbType: DbType, limit: Int?, -): List> { +): List> { val columnsCount = tableColumns.size val columnData = List(columnsCount) { mutableListOf() } var rowsRead = 0 while (rs.next() && (limit == null || rowsRead < limit)) { repeat(columnsCount) { columnIndex -> - val value = dbType.extractValueFromResultSet( + val typeInformation = columnTypeInformation[columnIndex].cast() + val value = dbType.getValueFromResultSet( rs = rs, columnIndex = columnIndex, - columnMetadata = tableColumns[columnIndex], - kType = columnKTypes.getValue(columnIndex), + typeInformation = typeInformation, ) - columnData[columnIndex].add(value) + val preprocessedValue = typeInformation.preprocess(value) + columnData[columnIndex].add(preprocessedValue) } rowsRead++ // if (rowsRead % 1000 == 0) logger.debug { "Loaded $rowsRead rows." } // TODO: https://github.com/Kotlin/dataframe/issues/455 @@ -928,29 +952,88 @@ private fun readAllRowsFromResultSet( * Accepts mutable lists to enable efficient in-place transformations. 
*/ private fun buildDataFrameFromColumnData( - columnData: List>, + columnData: List>, tableColumns: List, - columnKTypes: Map, + columnTypeInformation: List, dbType: DbType, inferNullability: Boolean, + checkSchema: Boolean = true, // TODO add as configurable parameter ): AnyFrame = columnData.mapIndexed { index, values -> - dbType.buildDataColumn( + val typeInformation = columnTypeInformation[index].cast() + val column = dbType.buildDataColumn( name = tableColumns[index].name, values = values, - kType = columnKTypes.getValue(index), + typeInformation = typeInformation, inferNullability = inferNullability, ) + val postProcessedColumn = typeInformation.postprocess(column) + + if (checkSchema) { + postProcessedColumn.checkSchema(typeInformation.targetSchema) + } + + postProcessedColumn }.toDataFrame() -/** - * Generates a KType based on the given database type and table column metadata. - * - * @param dbType The database type. - * @param tableColumnMetadata The table column metadata. - * - * @return The generated KType. - */ -internal fun generateKType(dbType: DbType, tableColumnMetadata: TableColumnMetadata): KType = - dbType.convertSqlTypeToKType(tableColumnMetadata) - ?: dbType.makeCommonSqlToKTypeMapping(tableColumnMetadata) +private fun AnyCol.checkSchema(expected: ColumnSchema) { + when (expected) { + is ColumnSchema.Value -> { + require(this.isValueColumn()) { + """ + Found mismatching schema for column '${this.name()}'. + Column ${this.name()} is expected to be a value column of type ${expected.type} but it is ${this.type()}. + """.trimIndent() + } + require(values().all { it == null || it::class.isSubclassOf(expected.type.classifier as KClass<*>) }) { + """ + Found mismatching type for value column '${this.name()}'. 
+ Expected type: ${expected.type} + Actual types: ${values().map { it?.javaClass?.name ?: "null" }.distinct()} + """.trimIndent() + } + } + + is ColumnSchema.Group -> { + require(this.isColumnGroup()) { + """ + Found mismatching schema for column '${name()}'. + Column ${this.name()} is expected to be a column group but it is ${this.type()}. + """.trimIndent() + } + require(expected.schema.compare(this.schema()).isSuperOrMatches()) { + """ + Found mismatching schema for column group '${name()}'. + Expected schema: + ${expected.schema} + + Actual schema: + ${this.schema()} + """.trimIndent() + } + } + + is ColumnSchema.Frame -> { + require(this.isFrameColumn()) { + """ + Found mismatching schema for column '${this.name()}'. + Column ${this.name()} is expected to be a frame column but it is ${this.type()}. + """.trimIndent() + } + require(values().all { expected.schema.compare(it.schema()).isSuperOrMatches() }) { + """ + Found mismatching schema for frame column '${this.name()}'. + Expected schema: + ${expected.schema} + + Actual (deviating) schemas: + ${ + values().map { it.schema() } + .distinct() + .filterNot { expected.schema.compare(it).isSuperOrMatches() } + } + """.trimIndent() + } + } + } +} From d21dc7993449ea84454290a63108cba81e10ee71 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 10 Dec 2025 17:24:07 +0100 Subject: [PATCH 03/14] converted DuckDb to new preprocessing DbType. 
Turns out I might need recursive preprocessing --- .../io/db/DbColumnTypeInformation.kt | 21 +- .../kotlinx/dataframe/io/db/DbType.kt | 5 +- .../kotlinx/dataframe/io/db/DuckDb.kt | 434 +++++++++--------- .../kotlinx/dataframe/io/db/MariaDb.kt | 10 +- .../kotlinx/dataframe/io/db/MySql.kt | 5 +- .../kotlinx/dataframe/io/db/PostgreSql.kt | 5 +- .../kotlinx/dataframe/io/readJdbc.kt | 4 +- 7 files changed, 220 insertions(+), 264 deletions(-) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt index d74c3364d2..3db5f9f066 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt @@ -18,16 +18,15 @@ public typealias AnyDbColumnTypeInformation = DbColumnTypeInformation<*, *, *> * to a [org.jetbrains.kotlinx.dataframe.DataColumn] of with values of type [P]. */ public open class DbColumnTypeInformation( - public open val columnMetadata: TableColumnMetadata, public open val targetSchema: ColumnSchema, public open val valuePreprocessor: DbValuePreprocessor?, public open val columnPostprocessor: DbColumnPostprocessor?, ) { - public open fun preprocess(value: J): D { + public open fun preprocess(value: J?): D? { valuePreprocessor?.let { valuePreprocessor -> return valuePreprocessor.preprocess(value, this) } - return value as D + return value as D? } public open fun postprocess(column: DataColumn): DataColumn

{ @@ -41,49 +40,39 @@ public open class DbColumnTypeInformation( public fun DbColumnTypeInformation<*, *, *>.cast(): DbColumnTypeInformation = this as DbColumnTypeInformation -public fun dbColumnTypeInformation( - columnMetadata: TableColumnMetadata, - targetSchema: ColumnSchema, -): DbColumnTypeInformation = +public fun dbColumnTypeInformation(targetSchema: ColumnSchema): DbColumnTypeInformation = DbColumnTypeInformation( - columnMetadata = columnMetadata, targetSchema = targetSchema, valuePreprocessor = null, columnPostprocessor = null, ) public fun dbColumnTypeInformationWithPreprocessing( - columnMetadata: TableColumnMetadata, targetSchema: ColumnSchema, valuePreprocessor: DbValuePreprocessor?, ): DbColumnTypeInformation = DbColumnTypeInformation( - columnMetadata = columnMetadata, targetSchema = targetSchema, valuePreprocessor = valuePreprocessor, columnPostprocessor = null, ) public fun dbColumnTypeInformationWithPostprocessing( - columnMetadata: TableColumnMetadata, targetSchema: ColumnSchema, columnPostprocessor: DbColumnPostprocessor?, ): DbColumnTypeInformation = DbColumnTypeInformation( - columnMetadata = columnMetadata, targetSchema = targetSchema, valuePreprocessor = null, columnPostprocessor = columnPostprocessor, ) public fun dbColumnTypeInformation( - columnMetadata: TableColumnMetadata, targetSchema: ColumnSchema, valuePreprocessor: DbValuePreprocessor?, columnPostprocessor: DbColumnPostprocessor?, ): DbColumnTypeInformation = DbColumnTypeInformation( - columnMetadata = columnMetadata, targetSchema = targetSchema, valuePreprocessor = valuePreprocessor, columnPostprocessor = columnPostprocessor, @@ -108,9 +97,9 @@ public fun interface DbValuePreprocessor { * return a [org.jetbrains.kotlinx.dataframe.DataFrame] here. */ public fun preprocess( - jdbcValue: J, + jdbcValue: J?, dbColumnTypeInformation: DbColumnTypeInformation<@UnsafeVariance J, @UnsafeVariance D, *>, - ): D + ): D? 
} public fun DbValuePreprocessor<*, *>.cast(): DbValuePreprocessor = this as DbValuePreprocessor diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt index baaa756228..b01398b8d3 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt @@ -183,7 +183,6 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { } return dbColumnTypeInformationWithPostprocessing( - columnMetadata = tableColumnMetadata, targetSchema = ColumnSchema.Value(kType.withNullability(tableColumnMetadata.isNullable)), columnPostprocessor = postprocessor?.cast(), ) @@ -211,9 +210,9 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { } as J public fun preprocessValuesFromResultSet( - value: J, + value: J?, dbColumnTypeInformation: DbColumnTypeInformation, - ): D = dbColumnTypeInformation.preprocess(value) + ): D? 
= dbColumnTypeInformation.preprocess(value) public open fun buildDataColumn( name: String, diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt index b353a206a2..859cd2fbb1 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -3,6 +3,8 @@ package org.jetbrains.kotlinx.dataframe.io.db import io.github.oshai.kotlinlogging.KotlinLogging import kotlinx.datetime.LocalDate import kotlinx.datetime.LocalTime +import kotlinx.datetime.toKotlinLocalDate +import kotlinx.datetime.toKotlinLocalTime import org.duckdb.DuckDBColumnType.ARRAY import org.duckdb.DuckDBColumnType.BIGINT import org.duckdb.DuckDBColumnType.BIT @@ -52,7 +54,7 @@ import org.jetbrains.kotlinx.dataframe.api.map import org.jetbrains.kotlinx.dataframe.api.single import org.jetbrains.kotlinx.dataframe.columns.ValueColumn import org.jetbrains.kotlinx.dataframe.io.DbConnectionConfig -import org.jetbrains.kotlinx.dataframe.io.db.DuckDb.convertSqlTypeToKType +import org.jetbrains.kotlinx.dataframe.io.db.dbColumnTypeInformation import org.jetbrains.kotlinx.dataframe.io.readAllSqlTables import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import java.math.BigDecimal @@ -64,6 +66,7 @@ import java.sql.DriverManager import java.sql.ResultSet import java.sql.Struct import java.util.Properties +import kotlin.collections.toList import kotlin.reflect.KType import kotlin.reflect.KTypeProjection import kotlin.reflect.full.createType @@ -72,6 +75,7 @@ import kotlin.reflect.typeOf import kotlin.time.Instant import kotlin.time.toKotlinInstant import kotlin.uuid.Uuid +import kotlin.uuid.toKotlinUuid import java.sql.Array as SqlArray import java.sql.Timestamp as SqlTimestamp import java.time.LocalDate as JavaLocalDate @@ -93,57 +97,215 @@ public object DuckDb : DbType("duckdb") { 
/** the name of the class of the DuckDB JDBC driver */ override val driverClassName: String = "org.duckdb.DuckDBDriver" - /** - * TODO: Unclear what this returned [KType] is useful for. Let's remove this function and just have - * [convertSqlTypeToColumnSchemaValue] - */ - override fun convertSqlTypeToKType(tableColumnMetadata: TableColumnMetadata): KType = - convertSqlTypeToColumnSchemaValue(tableColumnMetadata).type + override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation = + parseDuckDbType(tableColumnMetadata.sqlTypeName, tableColumnMetadata.isNullable) /** - * How a column from JDBC should be represented as DataFrame (value) column - * See [convertSqlTypeToKType]. + * How a column type from JDBC, [sqlTypeName], is read in Java/Kotlin. + * The returned type must exactly follow [ResultSet.getObject] of your specific database's JDBC driver. + * Returning `null` defers the implementation to the default one (which may not always be correct). + * + * Following [org.duckdb.DuckDBVector.getObject] and converting the result to the corresponding Kotlin types. + * */ - override fun convertSqlTypeToColumnSchemaValue(tableColumnMetadata: TableColumnMetadata): ColumnSchema = - parseDuckDbType(tableColumnMetadata.sqlTypeName, tableColumnMetadata.isNullable).targetSchema + internal fun parseDuckDbType(sqlTypeName: String, isNullable: Boolean): AnyDbColumnTypeInformation = + when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) { + BOOLEAN -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) - /** - * TODO: This function achieves the same goal as [convertSqlTypeToKType]. - */ - override fun makeCommonSqlToKTypeMapping(tableColumnMetadata: TableColumnMetadata): Nothing = - error("This function should not be called. 
Or exist, for that matter...") + TINYINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) - /** - * TODO: I wanted to do the conversion here, but as I have no source type ánd target type - * it's impossible. - * It would be easier to do conversion on the entire column because we can borrow [DataColumn.convertTo]. - */ - override fun buildDataColumn( - name: String, - values: MutableList, - kType: KType, - inferNullability: Boolean, - ): DataColumn<*> { - val sourceType = kType - return super.buildDataColumn(name, values, kType, inferNullability) + SMALLINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + INTEGER -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + BIGINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + HUGEINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + UHUGEINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + UTINYINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + USMALLINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + UINTEGER -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + UBIGINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + FLOAT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + DOUBLE -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + DECIMAL -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + // DataFrame can do this conversion + TIME -> dbColumnTypeInformationWithPreprocessing( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) 
{ it, _ -> it?.toKotlinLocalTime() } + + // todo? + TIME_WITH_TIME_ZONE -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + DATE -> dbColumnTypeInformationWithPreprocessing( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) { it, _ -> + it?.toKotlinLocalDate() + } + + TIMESTAMP, TIMESTAMP_MS, TIMESTAMP_NS, TIMESTAMP_S -> + dbColumnTypeInformationWithPreprocessing( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) { it, _ -> + it?.toInstant()?.toKotlinInstant() + } + + // todo? + TIMESTAMP_WITH_TIME_ZONE -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + // TODO! + JSON -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + BLOB -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + UUID -> dbColumnTypeInformationWithPreprocessing( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) { it, _ -> it?.toKotlinUuid() } + + MAP -> { + val (key, value) = parseMapTypes(sqlTypeName) + + val targetMapType = Map::class.createType( + listOf( + KTypeProjection.invariant(parseDuckDbType(key, false).targetSchema.type), + KTypeProjection.invariant(parseDuckDbType(value, true).targetSchema.type), + ), + ) + + dbColumnTypeInformation>(ColumnSchema.Value(targetMapType)) + } + + LIST, ARRAY -> { + // TODO requires #1266 and #1273 for specific types + val listType = parseListType(sqlTypeName) + val parsedListType = parseDuckDbType(listType, true) + val targetListType = List::class.createType( + listOf(KTypeProjection.invariant(parsedListType.targetSchema.type)), + ) + // todo maybe List should become FrameColumn + dbColumnTypeInformationWithPreprocessing>( + ColumnSchema.Value(targetListType), + ) { it, typeInfo -> + it?.toList() + } + } + + // TODO requires #1266 for specific types + STRUCT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + 
) + + // Cannot handle this in Kotlin + UNION -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + VARCHAR -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + UNKNOWN, BIT, INTERVAL, ENUM -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + } + + private fun SqlArray.toList(): List = + when (val array = this.array) { + is IntArray -> array.toList() + is LongArray -> array.toList() + is ShortArray -> array.toList() + is ByteArray -> array.toList() + is FloatArray -> array.toList() + is DoubleArray -> array.toList() + is BooleanArray -> array.toList() + is CharArray -> array.toList() + is Array<*> -> array.toList() + is SqlArray -> array.toList() + else -> error("unknown array type $array") + } + + /** Parses "MAP(X, Y)" into "X" and "Y", taking parentheses into account */ + internal fun parseMapTypes(typeString: String): Pair { + if (!typeString.startsWith("MAP(") || !typeString.endsWith(")")) { + error("invalid MAP type: $typeString") + } + + val content = typeString.removeSurrounding("MAP(", ")") + + // Find the comma that separates key and value types + var parenCount = 0 + var commaIndex = -1 + for (i in content.indices) { + when (content[i]) { + '(' -> parenCount++ + + ')' -> parenCount-- + + ',' -> if (parenCount == 0) { + commaIndex = i + break + } + } + } + + if (commaIndex == -1) error("invalid MAP type: $typeString") + val keyType = content.take(commaIndex).trim() + val valueType = content.substring(commaIndex + 1).trim() + return Pair(keyType, valueType) } - override fun extractValueFromResultSet( - rs: ResultSet, - columnIndex: Int, - columnMetadata: TableColumnMetadata, - kType: KType, - ): Any? { - // TODO This '+ 1' is easily forgotten if I need to override this function to do any conversion - val result = rs.getObject(columnIndex + 1) - - // TODO: where is the [ColumnSchema] when I need it? 
- // Now I need to call my [parseDuckDbType] function again... - val parsedType = parseDuckDbType(columnMetadata.sqlTypeName, columnMetadata.isNullable) - - // TODO doing it as a column - val convertedResult = parsedType.converter(columnOf(result)).single() - return convertedResult + /** Parses "X[]" and "X[123]" into "X", and "X[][]" into "X[]" */ + internal fun parseListType(typeString: String): String { + if (!typeString.endsWith("]")) { + error("invalid LIST/ARRAY type: $typeString") + } + + return typeString.take(typeString.indexOfLast { it == '[' }) } /** @@ -206,185 +368,3 @@ public object DuckDb : DbType("duckdb") { private fun String.isInMemoryDuckDb(): Boolean = this.trim() == "jdbc:duckdb:" || matches("jdbc:duckdb:\\s*$".toRegex()) } - -/** - * How a column type from JDBC, [sqlTypeName], is read in Java/Kotlin. - * The returned type must exactly follow [ResultSet.getObject] of your specific database's JDBC driver. - * Returning `null` defer the implementation to the default one (which may not always be correct). 
- * - * Following [org.duckdb.DuckDBVector.getObject] and converting the result to - * - */ -internal fun parseDuckDbType(sqlTypeName: String, isNullable: Boolean): ParsedType = - when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) { - BOOLEAN -> parsedTypeForValueColumnOf(isNullable) - - TINYINT -> parsedTypeForValueColumnOf(isNullable) - - SMALLINT -> parsedTypeForValueColumnOf(isNullable) - - INTEGER -> parsedTypeForValueColumnOf(isNullable) - - BIGINT -> parsedTypeForValueColumnOf(isNullable) - - HUGEINT -> parsedTypeForValueColumnOf(isNullable) - - UHUGEINT -> parsedTypeForValueColumnOf(isNullable) - - UTINYINT -> parsedTypeForValueColumnOf(isNullable) - - USMALLINT -> parsedTypeForValueColumnOf(isNullable) - - UINTEGER -> parsedTypeForValueColumnOf(isNullable) - - UBIGINT -> parsedTypeForValueColumnOf(isNullable) - - FLOAT -> parsedTypeForValueColumnOf(isNullable) - - DOUBLE -> parsedTypeForValueColumnOf(isNullable) - - DECIMAL -> parsedTypeForValueColumnOf(isNullable) - - // DataFrame can do this conversion - TIME -> parsedTypeForValueColumnOf(isNullable) { it.convertTo() } - - // todo? - TIME_WITH_TIME_ZONE -> parsedTypeForValueColumnOf(isNullable) - - // DataFrame can do this conversion - DATE -> parsedTypeForValueColumnOf(isNullable) { it.convertTo() } - - TIMESTAMP, TIMESTAMP_MS, TIMESTAMP_NS, TIMESTAMP_S -> - parsedTypeForValueColumnOf(isNullable) { - it.map { - it?.toInstant()?.toKotlinInstant() - }.asValueColumn().cast() - } - - // todo? - TIMESTAMP_WITH_TIME_ZONE -> parsedTypeForValueColumnOf(isNullable) - - // TODO! 
- JSON -> parsedTypeForValueColumnOf(isNullable) - - BLOB -> parsedTypeForValueColumnOf(isNullable) - - UUID -> parsedTypeForValueColumnOf(isNullable) { it.convertTo() } - - MAP -> { - val (key, value) = parseMapTypes(sqlTypeName) - val sourceMapType = Map::class.createType( - listOf( - KTypeProjection.invariant(parseDuckDbType(key, false).sourceType), - KTypeProjection.invariant(parseDuckDbType(value, true).sourceType), - ), - ) - val targetMapType = Map::class.createType( - listOf( - KTypeProjection.invariant(parseDuckDbType(key, false).targetSchema.type), - KTypeProjection.invariant(parseDuckDbType(value, true).targetSchema.type), - ), - ) - - ParsedType( - sourceType = sourceMapType, - targetSchema = ColumnSchema.Value(targetMapType), - converter = { it }, - ) - } - - LIST, ARRAY -> { - // TODO requires #1266 and #1273 for specific types - val listType = parseListType(sqlTypeName) - val parsedListType = parseDuckDbType(listType, true) - val targetListType = List::class.createType( - listOf(KTypeProjection.invariant(parsedListType.targetSchema.type)), - ) - // todo maybe List should become FrameColumn - ParsedType( - sourceType = typeOf(), - targetSchema = ColumnSchema.Value(targetListType), - converter = { it }, - ) - } - - // TODO requires #1266 for specific types - STRUCT -> parsedTypeForValueColumnOf(isNullable) - - // Cannot handle this in Kotlin - UNION -> parsedTypeForValueColumnOf(isNullable) - - VARCHAR -> parsedTypeForValueColumnOf(isNullable) - - UNKNOWN, BIT, INTERVAL, ENUM -> parsedTypeForValueColumnOf(isNullable) - } - -/** - * @property sourceType the source type of the column as read by [ResultSet.getObject] of our specific database's JDBC driver. - * @property targetSchema the target schema of the column. This can have a different [kType][ColumnSchema.type] than [sourceType]! - * If so, the values need to be converted in [DbType.buildDataColumn]. 
- * @property converter a function that converts the source column to the target column type - */ -internal data class ParsedType( - val sourceType: KType, - val targetSchema: ColumnSchema, - val converter: (DataColumn<*>) -> DataColumn<*>, -) - -internal inline fun parsedTypeForValueColumnOf(isNullable: Boolean): ParsedType { - val type = typeOf().withNullability(isNullable) - return ParsedType( - sourceType = type, - targetSchema = ColumnSchema.Value(type), - converter = { it }, - ) -} - -internal inline fun parsedTypeForValueColumnOf( - isNullable: Boolean, - noinline converter: (DataColumn) -> DataColumn, -): ParsedType = - ParsedType( - sourceType = typeOf().withNullability(isNullable), - targetSchema = ColumnSchema.Value(typeOf().withNullability(isNullable)), - converter = converter as (DataColumn<*>) -> DataColumn<*>, - ) - -/** Parses "MAP(X, Y)" into "X" and "Y", taking parentheses into account */ -internal fun parseMapTypes(typeString: String): Pair { - if (!typeString.startsWith("MAP(") || !typeString.endsWith(")")) { - error("invalid MAP type: $typeString") - } - - val content = typeString.removeSurrounding("MAP(", ")") - - // Find the comma that separates key and value types - var parenCount = 0 - var commaIndex = -1 - for (i in content.indices) { - when (content[i]) { - '(' -> parenCount++ - - ')' -> parenCount-- - - ',' -> if (parenCount == 0) { - commaIndex = i - break - } - } - } - - if (commaIndex == -1) error("invalid MAP type: $typeString") - val keyType = content.take(commaIndex).trim() - val valueType = content.substring(commaIndex + 1).trim() - return Pair(keyType, valueType) -} - -/** Parses "X[]" and "X[123]" into "X", and "X[][]" into "X[]" */ -internal fun parseListType(typeString: String): String { - if (!typeString.endsWith("]")) { - error("invalid LIST/ARRAY type: $typeString") - } - - return typeString.take(typeString.indexOfLast { it == '[' }) -} diff --git 
a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt index 172c6ee2f1..fe05dbac9b 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt @@ -29,18 +29,12 @@ public object MariaDb : DbType("mariadb") { tableColumnMetadata.sqlTypeName == "INT UNSIGNED" ) { val kType = typeOf().withNullability(tableColumnMetadata.isNullable) - return dbColumnTypeInformation( - columnMetadata = tableColumnMetadata, - targetSchema = ColumnSchema.Value(kType), - ) + return dbColumnTypeInformation(targetSchema = ColumnSchema.Value(kType)) } if (tableColumnMetadata.sqlTypeName == "SMALLINT" && tableColumnMetadata.javaClassName == "java.lang.Short") { val kType = typeOf().withNullability(tableColumnMetadata.isNullable) - return dbColumnTypeInformation( - columnMetadata = tableColumnMetadata, - targetSchema = ColumnSchema.Value(kType), - ) + return dbColumnTypeInformation(targetSchema = ColumnSchema.Value(kType)) } return super.generateTypeInformation(tableColumnMetadata) } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt index 8f17d99a6a..cf5c2283d4 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt @@ -19,10 +19,7 @@ public object MySql : DbType("mysql") { override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation { if (tableColumnMetadata.sqlTypeName == "INT UNSIGNED") { val kType = typeOf().withNullability(tableColumnMetadata.isNullable) - return dbColumnTypeInformation( - columnMetadata = tableColumnMetadata, - targetSchema = ColumnSchema.Value(kType), - ) + return 
dbColumnTypeInformation(targetSchema = ColumnSchema.Value(kType)) } return super.generateTypeInformation(tableColumnMetadata) } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt index fa5ea022ef..b4df0453da 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt @@ -20,10 +20,7 @@ public object PostgreSql : DbType("postgresql") { // because of https://github.com/pgjdbc/pgjdbc/issues/425 if (tableColumnMetadata.sqlTypeName == "money") { val kType = typeOf().withNullability(tableColumnMetadata.isNullable) - return dbColumnTypeInformation( - columnMetadata = tableColumnMetadata, - targetSchema = ColumnSchema.Value(kType), - ) + return dbColumnTypeInformation(targetSchema = ColumnSchema.Value(kType)) } return super.generateTypeInformation(tableColumnMetadata) } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt index e5139b06cc..94dd176805 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt @@ -937,7 +937,7 @@ private fun readAllRowsFromResultSet( columnIndex = columnIndex, typeInformation = typeInformation, ) - val preprocessedValue = typeInformation.preprocess(value) + val preprocessedValue = dbType.preprocessValuesFromResultSet(value, typeInformation) columnData[columnIndex].add(preprocessedValue) } rowsRead++ @@ -967,7 +967,7 @@ private fun buildDataFrameFromColumnData( typeInformation = typeInformation, inferNullability = inferNullability, ) - val postProcessedColumn = typeInformation.postprocess(column) + val postProcessedColumn = dbType.postProcessDataColumn(column, 
typeInformation) if (checkSchema) { postProcessedColumn.checkSchema(typeInformation.targetSchema) From 0ee702490ac9ee6871d95bc7b0823338a6007c1a Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 10 Dec 2025 17:32:21 +0100 Subject: [PATCH 04/14] wip DuckDb nested preprocessing --- .../kotlinx/dataframe/io/db/DuckDb.kt | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt index 859cd2fbb1..d06a314164 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -210,28 +210,44 @@ public object DuckDb : DbType("duckdb") { MAP -> { val (key, value) = parseMapTypes(sqlTypeName) + val parsedKeyType = parseDuckDbType(key, false) + val parsedValueType = + parseDuckDbType(value, true).cast() + val targetMapType = Map::class.createType( listOf( - KTypeProjection.invariant(parseDuckDbType(key, false).targetSchema.type), - KTypeProjection.invariant(parseDuckDbType(value, true).targetSchema.type), + KTypeProjection.invariant(parsedKeyType.targetSchema.type), + KTypeProjection.invariant(parsedValueType.targetSchema.type), ), ) - dbColumnTypeInformation>(ColumnSchema.Value(targetMapType)) + dbColumnTypeInformationWithPreprocessing, Map>( + ColumnSchema.Value(targetMapType), + ) { map, _ -> + // only need to preprocess the values, as the keys are just Strings + map?.mapValues { (_, value) -> + parsedValueType.preprocess(value) + } + } } LIST, ARRAY -> { // TODO requires #1266 and #1273 for specific types val listType = parseListType(sqlTypeName) - val parsedListType = parseDuckDbType(listType, true) + val parsedListType = + parseDuckDbType(listType, true).cast() + val targetListType = List::class.createType( 
listOf(KTypeProjection.invariant(parsedListType.targetSchema.type)), ) + // todo maybe List should become FrameColumn dbColumnTypeInformationWithPreprocessing>( ColumnSchema.Value(targetListType), - ) { it, typeInfo -> - it?.toList() + ) { array, _ -> + array + ?.toList() + ?.map(parsedListType::preprocess) // recursively preprocess } } From f96234db0a8becbb11b8f8686e437fe397a37d28 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 10 Dec 2025 19:38:18 +0100 Subject: [PATCH 05/14] added memoization for TableColumnMetadata -> AnyDbColumnTypeInformation --- .../io/db/DbColumnTypeInformation.kt | 4 +- .../kotlinx/dataframe/io/db/DbType.kt | 11 + .../kotlinx/dataframe/io/db/DuckDb.kt | 282 +++++++++--------- .../dataframe/io/readDataFrameSchema.kt | 2 +- .../kotlinx/dataframe/io/readJdbc.kt | 2 +- 5 files changed, 159 insertions(+), 142 deletions(-) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt index 3db5f9f066..9311f7422a 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt @@ -8,7 +8,9 @@ public typealias AnyDbColumnTypeInformation = DbColumnTypeInformation<*, *, *> /** * Represents all type information that can be retrieved from an SQL column. - * This can be extended for your specific [DbType2] if you need extra information. + * This can be extended for your specific [DbType] if you need extra information. + * + * This class needs to be stateless, so it can be memoized in [DbType.getOrGenerateTypeInformation]. * * @property targetSchema the target schema of the column after running the optional * [valuePreprocessor] and [columnPostprocessor]. 
diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt index b01398b8d3..efb7c24227 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt @@ -134,8 +134,19 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { Types.TIMESTAMP_WITH_TIMEZONE to typeOf(), ) + private val typeInformationCache = mutableMapOf() + + /** + * Returns a [DbColumnTypeInformation] produced from [tableColumnMetadata]. + */ + public fun getOrGenerateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation = + typeInformationCache.getOrPut(tableColumnMetadata) { generateTypeInformation(tableColumnMetadata) } + /** * Returns a [DbColumnTypeInformation] produced from [tableColumnMetadata]. + * + * This function can be overridden by returning your own [DbColumnTypeInformation] or a subtype of that. + * Do note that this class needs to be stateless, so this function can be memoized. 
*/ public open fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation { val kType = when { diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt index d06a314164..bdfb8a194e 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -100,6 +100,8 @@ public object DuckDb : DbType("duckdb") { override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation = parseDuckDbType(tableColumnMetadata.sqlTypeName, tableColumnMetadata.isNullable) + private val duckDbTypeCache = mutableMapOf, AnyDbColumnTypeInformation>() + /** * How a column type from JDBC, [sqlTypeName], is read in Java/Kotlin. * The returned type must exactly follow [ResultSet.getObject] of your specific database's JDBC driver. 
@@ -109,165 +111,167 @@ public object DuckDb : DbType("duckdb") { * */ internal fun parseDuckDbType(sqlTypeName: String, isNullable: Boolean): AnyDbColumnTypeInformation = - when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) { - BOOLEAN -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - TINYINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - SMALLINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - INTEGER -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - BIGINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - HUGEINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - UHUGEINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - UTINYINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - USMALLINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - UINTEGER -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - UBIGINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - FLOAT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - DOUBLE -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - DECIMAL -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - // DataFrame can do this conversion - TIME -> dbColumnTypeInformationWithPreprocessing( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) { it, _ -> it?.toKotlinLocalTime() } - - // todo? 
- TIME_WITH_TIME_ZONE -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - DATE -> dbColumnTypeInformationWithPreprocessing( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) { it, _ -> - it?.toKotlinLocalDate() - } + duckDbTypeCache.getOrPut(Pair(sqlTypeName, isNullable)) { + when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) { + BOOLEAN -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) - TIMESTAMP, TIMESTAMP_MS, TIMESTAMP_NS, TIMESTAMP_S -> - dbColumnTypeInformationWithPreprocessing( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) { it, _ -> - it?.toInstant()?.toKotlinInstant() - } + TINYINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + SMALLINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + INTEGER -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + BIGINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) - // todo? - TIMESTAMP_WITH_TIME_ZONE -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + HUGEINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) - // TODO! 
- JSON -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + UHUGEINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) - BLOB -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + UTINYINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) - UUID -> dbColumnTypeInformationWithPreprocessing( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) { it, _ -> it?.toKotlinUuid() } + USMALLINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) - MAP -> { - val (key, value) = parseMapTypes(sqlTypeName) + UINTEGER -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) - val parsedKeyType = parseDuckDbType(key, false) - val parsedValueType = - parseDuckDbType(value, true).cast() + UBIGINT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) - val targetMapType = Map::class.createType( - listOf( - KTypeProjection.invariant(parsedKeyType.targetSchema.type), - KTypeProjection.invariant(parsedValueType.targetSchema.type), - ), + FLOAT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), ) - dbColumnTypeInformationWithPreprocessing, Map>( - ColumnSchema.Value(targetMapType), - ) { map, _ -> - // only need to preprocess the values, as the keys are just Strings - map?.mapValues { (_, value) -> - parsedValueType.preprocess(value) - } + DOUBLE -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + DECIMAL -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + // DataFrame can do this conversion + TIME -> dbColumnTypeInformationWithPreprocessing( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) { it, _ -> it?.toKotlinLocalTime() } + + // todo? 
+ TIME_WITH_TIME_ZONE -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + DATE -> dbColumnTypeInformationWithPreprocessing( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) { it, _ -> + it?.toKotlinLocalDate() } - } - LIST, ARRAY -> { - // TODO requires #1266 and #1273 for specific types - val listType = parseListType(sqlTypeName) - val parsedListType = - parseDuckDbType(listType, true).cast() + TIMESTAMP, TIMESTAMP_MS, TIMESTAMP_NS, TIMESTAMP_S -> + dbColumnTypeInformationWithPreprocessing( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) { it, _ -> + it?.toInstant()?.toKotlinInstant() + } + + // todo? + TIMESTAMP_WITH_TIME_ZONE -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + + // TODO! + JSON -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) - val targetListType = List::class.createType( - listOf(KTypeProjection.invariant(parsedListType.targetSchema.type)), + BLOB -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), ) - // todo maybe List should become FrameColumn - dbColumnTypeInformationWithPreprocessing>( - ColumnSchema.Value(targetListType), - ) { array, _ -> - array - ?.toList() - ?.map(parsedListType::preprocess) // recursively preprocess + UUID -> dbColumnTypeInformationWithPreprocessing( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) { it, _ -> it?.toKotlinUuid() } + + MAP -> { + val (key, value) = parseMapTypes(sqlTypeName) + + val parsedKeyType = parseDuckDbType(key, false) + val parsedValueType = + parseDuckDbType(value, true).cast() + + val targetMapType = Map::class.createType( + listOf( + KTypeProjection.invariant(parsedKeyType.targetSchema.type), + KTypeProjection.invariant(parsedValueType.targetSchema.type), + ), + ) + + dbColumnTypeInformationWithPreprocessing, Map>( + ColumnSchema.Value(targetMapType), + ) { map, _ -> + // only need to 
preprocess the values, as the keys are just Strings + map?.mapValues { (_, value) -> + parsedValueType.preprocess(value) + } + } + } + + LIST, ARRAY -> { + // TODO requires #1266 and #1273 for specific types + val listType = parseListType(sqlTypeName) + val parsedListType = + parseDuckDbType(listType, true).cast() + + val targetListType = List::class.createType( + listOf(KTypeProjection.invariant(parsedListType.targetSchema.type)), + ) + + // todo maybe List should become FrameColumn + dbColumnTypeInformationWithPreprocessing>( + ColumnSchema.Value(targetListType), + ) { array, _ -> + array + ?.toList() + ?.map(parsedListType::preprocess) // recursively preprocess + } } - } - // TODO requires #1266 for specific types - STRUCT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + // TODO requires #1266 for specific types + STRUCT -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) - // Cannot handle this in Kotlin - UNION -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + // Cannot handle this in Kotlin + UNION -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) - VARCHAR -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + VARCHAR -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) - UNKNOWN, BIT, INTERVAL, ENUM -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + UNKNOWN, BIT, INTERVAL, ENUM -> dbColumnTypeInformation( + ColumnSchema.Value(typeOf().withNullability(isNullable)), + ) + } } private fun SqlArray.toList(): List = diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readDataFrameSchema.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readDataFrameSchema.kt index 4684db0567..8944631656 100644 --- 
a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readDataFrameSchema.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readDataFrameSchema.kt @@ -451,7 +451,7 @@ internal fun buildSchemaByTableColumns( dbType: DbType, ): DataFrameSchema { val schemaColumns = tableColumns.associate { - it.name to dbType.generateTypeInformation(it).targetSchema + it.name to dbType.getOrGenerateTypeInformation(it).targetSchema } return DataFrameSchemaImpl(columns = schemaColumns) } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt index 94dd176805..4e71d5010b 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt @@ -912,7 +912,7 @@ private fun buildColumnTypeInformation( dbType: DbType, ): List = tableColumns.indices.map { index -> - dbType.generateTypeInformation(tableColumns[index]) + dbType.getOrGenerateTypeInformation(tableColumns[index]) } /** From d1b655ba4263585a1931b9acab5feebca43d2b86 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Thu, 11 Dec 2025 13:34:55 +0100 Subject: [PATCH 06/14] renaming, added extra constructors for TypeInformation, restricting types --- .../io/db/DbColumnTypeInformation.kt | 124 ------------ .../kotlinx/dataframe/io/db/DbType.kt | 48 ++--- .../kotlinx/dataframe/io/db/DuckDb.kt | 151 +++++--------- .../jetbrains/kotlinx/dataframe/io/db/H2.kt | 2 +- .../kotlinx/dataframe/io/db/MariaDb.kt | 8 +- .../kotlinx/dataframe/io/db/MySql.kt | 5 +- .../kotlinx/dataframe/io/db/PostgreSql.kt | 5 +- .../dataframe/io/db/TypeInformation.kt | 187 ++++++++++++++++++ .../kotlinx/dataframe/io/readJdbc.kt | 13 +- 9 files changed, 270 insertions(+), 273 deletions(-) delete mode 100644 dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt create mode 
100644 dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt deleted file mode 100644 index 9311f7422a..0000000000 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbColumnTypeInformation.kt +++ /dev/null @@ -1,124 +0,0 @@ -package org.jetbrains.kotlinx.dataframe.io.db - -import org.jetbrains.kotlinx.dataframe.DataColumn -import org.jetbrains.kotlinx.dataframe.api.cast -import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema - -public typealias AnyDbColumnTypeInformation = DbColumnTypeInformation<*, *, *> - -/** - * Represents all type information that can be retrieved from an SQL column. - * This can be extended for your specific [DbType] if you need extra information. - * - * This class needs to be stateless, so it can be memoized in [DbType.getOrGenerateTypeInformation]. - * - * @property targetSchema the target schema of the column after running the optional - * [valuePreprocessor] and [columnPostprocessor]. - * @property valuePreprocessor an optional function that converts values from [java.sql.ResultSet.getObject] - * to a cell/row suitable to be put into a [org.jetbrains.kotlinx.dataframe.DataColumn]. - * @property columnPostprocessor an optional function that converts a [org.jetbrains.kotlinx.dataframe.DataColumn] with values of type [D] - * to a [org.jetbrains.kotlinx.dataframe.DataColumn] of with values of type [P]. - */ -public open class DbColumnTypeInformation( - public open val targetSchema: ColumnSchema, - public open val valuePreprocessor: DbValuePreprocessor?, - public open val columnPostprocessor: DbColumnPostprocessor?, -) { - public open fun preprocess(value: J?): D? 
{ - valuePreprocessor?.let { valuePreprocessor -> - return valuePreprocessor.preprocess(value, this) - } - return value as D? - } - - public open fun postprocess(column: DataColumn): DataColumn

{ - columnPostprocessor?.let { columnPostprocessor -> - return columnPostprocessor.postprocess(column, this) - } - return column.cast() - } -} - -public fun DbColumnTypeInformation<*, *, *>.cast(): DbColumnTypeInformation = - this as DbColumnTypeInformation - -public fun dbColumnTypeInformation(targetSchema: ColumnSchema): DbColumnTypeInformation = - DbColumnTypeInformation( - targetSchema = targetSchema, - valuePreprocessor = null, - columnPostprocessor = null, - ) - -public fun dbColumnTypeInformationWithPreprocessing( - targetSchema: ColumnSchema, - valuePreprocessor: DbValuePreprocessor?, -): DbColumnTypeInformation = - DbColumnTypeInformation( - targetSchema = targetSchema, - valuePreprocessor = valuePreprocessor, - columnPostprocessor = null, - ) - -public fun dbColumnTypeInformationWithPostprocessing( - targetSchema: ColumnSchema, - columnPostprocessor: DbColumnPostprocessor?, -): DbColumnTypeInformation = - DbColumnTypeInformation( - targetSchema = targetSchema, - valuePreprocessor = null, - columnPostprocessor = columnPostprocessor, - ) - -public fun dbColumnTypeInformation( - targetSchema: ColumnSchema, - valuePreprocessor: DbValuePreprocessor?, - columnPostprocessor: DbColumnPostprocessor?, -): DbColumnTypeInformation = - DbColumnTypeInformation( - targetSchema = targetSchema, - valuePreprocessor = valuePreprocessor, - columnPostprocessor = columnPostprocessor, - ) - -/** - * This preprocessor can be created for types where you want to convert the values - * coming from [java.sql.ResultSet.getObject] to a different type more suitable to be put in a [DataColumn] - * - * @param J the type of the value coming from the JDBC driver. - * @param D the type of the column values after preprocessing. - */ -public fun interface DbValuePreprocessor { - - /** - * Converts the given [jdbcValue]: [J] to a [D]. - * - * If you intend to create a [org.jetbrains.kotlinx.dataframe.columns.ColumnGroup], - * return a [org.jetbrains.kotlinx.dataframe.DataRow] here. 
- * - * If you intend to create a [org.jetbrains.kotlinx.dataframe.columns.FrameColumn], - * return a [org.jetbrains.kotlinx.dataframe.DataFrame] here. - */ - public fun preprocess( - jdbcValue: J?, - dbColumnTypeInformation: DbColumnTypeInformation<@UnsafeVariance J, @UnsafeVariance D, *>, - ): D? -} - -public fun DbValuePreprocessor<*, *>.cast(): DbValuePreprocessor = this as DbValuePreprocessor - -/** - * @param D the type of the column values before postprocessing. - * @param P the type of the column values after postprocessing. - */ -public fun interface DbColumnPostprocessor { - - /** - * Converts the given [column]: [DataColumn] with values of type [D] to a [DataColumn] of with values of type [P]. - */ - public fun postprocess( - column: DataColumn, - dbColumnTypeInformation: DbColumnTypeInformation<*, @UnsafeVariance D, @UnsafeVariance P>, - ): DataColumn

-} - -public fun DbColumnPostprocessor<*, *>.cast(): DbColumnPostprocessor = this as DbColumnPostprocessor diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt index efb7c24227..0a78c714bf 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt @@ -134,21 +134,21 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { Types.TIMESTAMP_WITH_TIMEZONE to typeOf(), ) - private val typeInformationCache = mutableMapOf() + private val typeInformationCache = mutableMapOf() /** - * Returns a [DbColumnTypeInformation] produced from [tableColumnMetadata]. + * Returns a [TypeInformation] produced from [tableColumnMetadata]. */ - public fun getOrGenerateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation = + public fun getOrGenerateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyTypeInformation = typeInformationCache.getOrPut(tableColumnMetadata) { generateTypeInformation(tableColumnMetadata) } /** - * Returns a [DbColumnTypeInformation] produced from [tableColumnMetadata]. + * Returns a [TypeInformation] produced from [tableColumnMetadata]. * - * This function can be overridden by returning your own [DbColumnTypeInformation] or a subtype of that. + * This function can be overridden by returning your own [TypeInformation] or a subtype of that. * Do note that this class needs to be stateless, so this function can be memoized. 
*/ - public open fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation { + public open fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyTypeInformation { val kType = when { tableColumnMetadata.jdbcType == Types.OTHER -> when (tableColumnMetadata.javaClassName) { @@ -186,16 +186,16 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { val postprocessor = when (tableColumnMetadata.jdbcType) { Types.ARRAY -> - DbColumnPostprocessor, Any?> { column, _ -> + DbColumnPostprocessor, Any> { column, _ -> handleArrayValues(column.asValueColumn()) } else -> null } - return dbColumnTypeInformationWithPostprocessing( + return typeInformationWithPostprocessingFor( targetSchema = ColumnSchema.Value(kType.withNullability(tableColumnMetadata.isNullable)), - columnPostprocessor = postprocessor?.cast(), + columnPostprocessor = postprocessor?.castToAny(), ) } @@ -208,29 +208,29 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { * @param [typeInformation] * @return the extracted value, or null */ - public open fun getValueFromResultSet( + public open fun getValueFromResultSet( rs: ResultSet, columnIndex: Int, - typeInformation: DbColumnTypeInformation, - ): J = + typeInformation: TypeInformation, + ): J? = try { rs.getObject(columnIndex + 1) } catch (_: Throwable) { // TODO? rs.getString(columnIndex + 1) - } as J + } as J? - public fun preprocessValuesFromResultSet( + public fun preprocessValuesFromResultSet( value: J?, - dbColumnTypeInformation: DbColumnTypeInformation, - ): D? = dbColumnTypeInformation.preprocess(value) + typeInformation: TypeInformation, + ): D? 
= typeInformation.preprocess(value) - public open fun buildDataColumn( + public open fun buildDataColumn( name: String, - values: List, - typeInformation: DbColumnTypeInformation<*, D, *>, + values: List, + typeInformation: TypeInformation<*, D, *>, inferNullability: Boolean, - ): DataColumn = + ): DataColumn = when (val schema = typeInformation.targetSchema) { is ColumnSchema.Value -> DataColumn.createValueColumn( @@ -254,10 +254,10 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { ).cast() } - public fun postProcessDataColumn( - column: DataColumn, - dbColumnTypeInformation: DbColumnTypeInformation<*, D, P>, - ): DataColumn

= dbColumnTypeInformation.postprocess(column) + public fun postProcessDataColumn( + column: DataColumn, + typeInformation: TypeInformation<*, D, P>, + ): DataColumn = typeInformation.postprocess(column) /** * Checks if the given table name is a system table for the specified database type. diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt index bdfb8a194e..35b547924c 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -42,19 +42,8 @@ import org.duckdb.DuckDBColumnType.UUID import org.duckdb.DuckDBColumnType.VARCHAR import org.duckdb.DuckDBResultSetMetaData import org.duckdb.JsonNode -import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame -import org.jetbrains.kotlinx.dataframe.api.asValueColumn -import org.jetbrains.kotlinx.dataframe.api.cast -import org.jetbrains.kotlinx.dataframe.api.columnOf -import org.jetbrains.kotlinx.dataframe.api.convertTo -import org.jetbrains.kotlinx.dataframe.api.convertToLocalDate -import org.jetbrains.kotlinx.dataframe.api.convertToLocalTime -import org.jetbrains.kotlinx.dataframe.api.map -import org.jetbrains.kotlinx.dataframe.api.single -import org.jetbrains.kotlinx.dataframe.columns.ValueColumn import org.jetbrains.kotlinx.dataframe.io.DbConnectionConfig -import org.jetbrains.kotlinx.dataframe.io.db.dbColumnTypeInformation import org.jetbrains.kotlinx.dataframe.io.readAllSqlTables import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import java.math.BigDecimal @@ -67,11 +56,8 @@ import java.sql.ResultSet import java.sql.Struct import java.util.Properties import kotlin.collections.toList -import kotlin.reflect.KType import kotlin.reflect.KTypeProjection import kotlin.reflect.full.createType -import kotlin.reflect.full.withNullability 
-import kotlin.reflect.typeOf import kotlin.time.Instant import kotlin.time.toKotlinInstant import kotlin.uuid.Uuid @@ -97,10 +83,10 @@ public object DuckDb : DbType("duckdb") { /** the name of the class of the DuckDB JDBC driver */ override val driverClassName: String = "org.duckdb.DuckDBDriver" - override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation = + override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyTypeInformation = parseDuckDbType(tableColumnMetadata.sqlTypeName, tableColumnMetadata.isNullable) - private val duckDbTypeCache = mutableMapOf, AnyDbColumnTypeInformation>() + private val duckDbTypeCache = mutableMapOf, AnyTypeInformation>() /** * How a column type from JDBC, [sqlTypeName], is read in Java/Kotlin. @@ -110,104 +96,63 @@ public object DuckDb : DbType("duckdb") { * Following [org.duckdb.DuckDBVector.getObject] and converting the result to * */ - internal fun parseDuckDbType(sqlTypeName: String, isNullable: Boolean): AnyDbColumnTypeInformation = + internal fun parseDuckDbType(sqlTypeName: String, isNullable: Boolean): AnyTypeInformation = duckDbTypeCache.getOrPut(Pair(sqlTypeName, isNullable)) { when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) { - BOOLEAN -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + BOOLEAN -> typeInformationForValueColumnOf(isNullable) - TINYINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + TINYINT -> typeInformationForValueColumnOf(isNullable) - SMALLINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + SMALLINT -> typeInformationForValueColumnOf(isNullable) - INTEGER -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + INTEGER -> typeInformationForValueColumnOf(isNullable) - BIGINT -> dbColumnTypeInformation( - 
ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + BIGINT -> typeInformationForValueColumnOf(isNullable) - HUGEINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + HUGEINT -> typeInformationForValueColumnOf(isNullable) - UHUGEINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + UHUGEINT -> typeInformationForValueColumnOf(isNullable) - UTINYINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + UTINYINT -> typeInformationForValueColumnOf(isNullable) - USMALLINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + USMALLINT -> typeInformationForValueColumnOf(isNullable) - UINTEGER -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + UINTEGER -> typeInformationForValueColumnOf(isNullable) - UBIGINT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + UBIGINT -> typeInformationForValueColumnOf(isNullable) - FLOAT -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + FLOAT -> typeInformationForValueColumnOf(isNullable) - DOUBLE -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + DOUBLE -> typeInformationForValueColumnOf(isNullable) - DECIMAL -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + DECIMAL -> typeInformationForValueColumnOf(isNullable) - // DataFrame can do this conversion - TIME -> dbColumnTypeInformationWithPreprocessing( - ColumnSchema.Value(typeOf().withNullability(isNullable)), + TIME -> typeInformationWithPreprocessingForValueColumnOf( + isNullable = isNullable, ) { it, _ -> it?.toKotlinLocalTime() } // todo? 
- TIME_WITH_TIME_ZONE -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) - - DATE -> dbColumnTypeInformationWithPreprocessing( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) { it, _ -> - it?.toKotlinLocalDate() - } + TIME_WITH_TIME_ZONE -> typeInformationForValueColumnOf(isNullable) + + DATE -> typeInformationWithPreprocessingForValueColumnOf( + isNullable = isNullable, + ) { it, _ -> it?.toKotlinLocalDate() } TIMESTAMP, TIMESTAMP_MS, TIMESTAMP_NS, TIMESTAMP_S -> - dbColumnTypeInformationWithPreprocessing( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) { it, _ -> - it?.toInstant()?.toKotlinInstant() - } + typeInformationWithPreprocessingForValueColumnOf( + isNullable = isNullable, + ) { it, _ -> it?.toInstant()?.toKotlinInstant() } // todo? - TIMESTAMP_WITH_TIME_ZONE -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + TIMESTAMP_WITH_TIME_ZONE -> typeInformationForValueColumnOf(isNullable) // TODO! 
- JSON -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + JSON -> typeInformationForValueColumnOf(isNullable) - BLOB -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + BLOB -> typeInformationForValueColumnOf(isNullable) - UUID -> dbColumnTypeInformationWithPreprocessing( - ColumnSchema.Value(typeOf().withNullability(isNullable)), + UUID -> typeInformationWithPreprocessingForValueColumnOf( + isNullable = isNullable, ) { it, _ -> it?.toKotlinUuid() } MAP -> { @@ -215,7 +160,7 @@ public object DuckDb : DbType("duckdb") { val parsedKeyType = parseDuckDbType(key, false) val parsedValueType = - parseDuckDbType(value, true).cast() + parseDuckDbType(value, true).cast() val targetMapType = Map::class.createType( listOf( @@ -224,8 +169,8 @@ public object DuckDb : DbType("duckdb") { ), ) - dbColumnTypeInformationWithPreprocessing, Map>( - ColumnSchema.Value(targetMapType), + typeInformationWithPreprocessingForValueColumnOf, Map>( + targetColumnType = targetMapType, ) { map, _ -> // only need to preprocess the values, as the keys are just Strings map?.mapValues { (_, value) -> @@ -238,39 +183,31 @@ public object DuckDb : DbType("duckdb") { // TODO requires #1266 and #1273 for specific types val listType = parseListType(sqlTypeName) val parsedListType = - parseDuckDbType(listType, true).cast() + parseDuckDbType(listType, true).castToAny() val targetListType = List::class.createType( listOf(KTypeProjection.invariant(parsedListType.targetSchema.type)), ) // todo maybe List should become FrameColumn - dbColumnTypeInformationWithPreprocessing>( - ColumnSchema.Value(targetListType), - ) { array, _ -> - array + typeInformationWithPreprocessingFor>( + targetSchema = ColumnSchema.Value(targetListType), + ) { sqlArray, _ -> + sqlArray ?.toList() ?.map(parsedListType::preprocess) // recursively preprocess } } // TODO requires #1266 for specific types - STRUCT -> dbColumnTypeInformation( - 
ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + STRUCT -> typeInformationForValueColumnOf(isNullable) // Cannot handle this in Kotlin - UNION -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + UNION -> typeInformationForValueColumnOf(isNullable) - VARCHAR -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + VARCHAR -> typeInformationForValueColumnOf(isNullable) - UNKNOWN, BIT, INTERVAL, ENUM -> dbColumnTypeInformation( - ColumnSchema.Value(typeOf().withNullability(isNullable)), - ) + UNKNOWN, BIT, INTERVAL, ENUM -> typeInformationForValueColumnOf(isNullable) } } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/H2.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/H2.kt index 98f58d0350..d5c0a0e5b2 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/H2.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/H2.kt @@ -117,7 +117,7 @@ public open class H2(public val mode: Mode = Mode.Regular) : DbType("h2") { override val driverClassName: String get() = "org.h2.Driver" - override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation = + override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyTypeInformation = delegate?.generateTypeInformation(tableColumnMetadata) ?: super.generateTypeInformation(tableColumnMetadata) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt index fe05dbac9b..191dadb57a 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt @@ -15,7 +15,7 @@ public object MariaDb : DbType("mariadb") { override val driverClassName: String get() 
= "org.mariadb.jdbc.Driver" - override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation { + override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyTypeInformation { // Force BIGINT to always be Long, regardless of javaClassName // MariaDB JDBC driver may report Integer for small BIGINT values // TODO: investigate the corner case @@ -28,13 +28,11 @@ public object MariaDb : DbType("mariadb") { if (tableColumnMetadata.sqlTypeName == "INTEGER UNSIGNED" || tableColumnMetadata.sqlTypeName == "INT UNSIGNED" ) { - val kType = typeOf().withNullability(tableColumnMetadata.isNullable) - return dbColumnTypeInformation(targetSchema = ColumnSchema.Value(kType)) + return typeInformationForValueColumnOf(tableColumnMetadata.isNullable) } if (tableColumnMetadata.sqlTypeName == "SMALLINT" && tableColumnMetadata.javaClassName == "java.lang.Short") { - val kType = typeOf().withNullability(tableColumnMetadata.isNullable) - return dbColumnTypeInformation(targetSchema = ColumnSchema.Value(kType)) + return typeInformationForValueColumnOf(tableColumnMetadata.isNullable) } return super.generateTypeInformation(tableColumnMetadata) } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt index cf5c2283d4..c5cab02de1 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt @@ -16,10 +16,9 @@ public object MySql : DbType("mysql") { override val driverClassName: String get() = "com.mysql.jdbc.Driver" - override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation { + override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyTypeInformation { if (tableColumnMetadata.sqlTypeName == "INT UNSIGNED") { - val kType = 
typeOf().withNullability(tableColumnMetadata.isNullable) - return dbColumnTypeInformation(targetSchema = ColumnSchema.Value(kType)) + return typeInformationForValueColumnOf(tableColumnMetadata.isNullable) } return super.generateTypeInformation(tableColumnMetadata) } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt index b4df0453da..edff12478b 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt @@ -16,11 +16,10 @@ public object PostgreSql : DbType("postgresql") { override val driverClassName: String get() = "org.postgresql.Driver" - override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyDbColumnTypeInformation { + override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyTypeInformation { // because of https://github.com/pgjdbc/pgjdbc/issues/425 if (tableColumnMetadata.sqlTypeName == "money") { - val kType = typeOf().withNullability(tableColumnMetadata.isNullable) - return dbColumnTypeInformation(targetSchema = ColumnSchema.Value(kType)) + return typeInformationForValueColumnOf(tableColumnMetadata.isNullable) } return super.generateTypeInformation(tableColumnMetadata) } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt new file mode 100644 index 0000000000..e9faa9b998 --- /dev/null +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt @@ -0,0 +1,187 @@ +package org.jetbrains.kotlinx.dataframe.io.db + +import org.jetbrains.kotlinx.dataframe.DataColumn +import org.jetbrains.kotlinx.dataframe.api.cast +import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema +import 
kotlin.reflect.KType +import kotlin.reflect.full.withNullability +import kotlin.reflect.typeOf + +public typealias AnyTypeInformation = TypeInformation<*, *, *> + +/** + * Represents all type information that can be retrieved from an SQL column. + * This can be extended for your specific [DbType] if you need extra information. + * + * This class needs to be stateless, so it can be memoized in [DbType.getOrGenerateTypeInformation]. + * + * + * @param J the type of the value coming from the JDBC driver. + * @param D the type of the column values after preprocessing. Will be equal to [J] if [valuePreprocessor] is `null`. + * @param P the type of the column values after postprocessing. Will be equal to [D] if [columnPostprocessor] is `null`. + * + * @property targetSchema the target schema of the column after running the optional + * [valuePreprocessor] and [columnPostprocessor]. + * @property valuePreprocessor an optional function that converts values from [java.sql.ResultSet.getObject] + * to a cell/row suitable to be put into a [DataColumn]. + * @property columnPostprocessor an optional function that converts a [DataColumn] with values of type [D] + * to a [DataColumn] of with values of type [P]. + */ +public open class TypeInformation( + public open val targetSchema: ColumnSchema, + public open val valuePreprocessor: DbValuePreprocessor?, + public open val columnPostprocessor: DbColumnPostprocessor?, +) { + public open fun preprocess(value: J?): D? { + valuePreprocessor?.let { valuePreprocessor -> + return valuePreprocessor.preprocess(value, this) + } + return value as D? 
+ } + + public open fun postprocess(column: DataColumn): DataColumn { + columnPostprocessor?.let { columnPostprocessor -> + return columnPostprocessor.postprocess(column, this) + } + return column.cast() + } +} + +public fun TypeInformation<*, *, *>.cast(): TypeInformation = + this as TypeInformation + +public fun TypeInformation<*, *, *>.castToAny(): TypeInformation = cast() + +// region generic constructors + +public fun typeInformationWithProcessingFor( + targetSchema: ColumnSchema, + valuePreprocessor: DbValuePreprocessor?, + columnPostprocessor: DbColumnPostprocessor?, +): TypeInformation = + TypeInformation( + targetSchema = targetSchema, + valuePreprocessor = valuePreprocessor, + columnPostprocessor = columnPostprocessor, + ) + +public fun typeInformationFor(targetSchema: ColumnSchema): TypeInformation = + typeInformationWithProcessingFor( + targetSchema = targetSchema, + valuePreprocessor = null, + columnPostprocessor = null, + ) + +public fun typeInformationWithPreprocessingFor( + targetSchema: ColumnSchema, + valuePreprocessor: DbValuePreprocessor?, +): TypeInformation = + typeInformationWithProcessingFor( + targetSchema = targetSchema, + valuePreprocessor = valuePreprocessor, + columnPostprocessor = null, + ) + +public fun typeInformationWithPostprocessingFor( + targetSchema: ColumnSchema, + columnPostprocessor: DbColumnPostprocessor?, +): TypeInformation = + typeInformationWithProcessingFor( + targetSchema = targetSchema, + valuePreprocessor = null, + columnPostprocessor = columnPostprocessor, + ) + +// endregion + +// region ValueColumn constructors + +public fun typeInformationForValueColumnOf(kType: KType): TypeInformation = + typeInformationFor(targetSchema = ColumnSchema.Value(kType)) + +public inline fun typeInformationForValueColumnOf(isNullable: Boolean): TypeInformation = + typeInformationForValueColumnOf(typeOf().withNullability(isNullable)) + +public fun typeInformationWithPreprocessingForValueColumnOf( + targetColumnType: KType, + 
valuePreprocessor: DbValuePreprocessor?, +): TypeInformation = + typeInformationWithPreprocessingFor( + targetSchema = ColumnSchema.Value(targetColumnType), + valuePreprocessor = valuePreprocessor, + ) + +public inline fun typeInformationWithPreprocessingForValueColumnOf( + isNullable: Boolean, + valuePreprocessor: DbValuePreprocessor?, +): TypeInformation = + typeInformationWithPreprocessingForValueColumnOf( + targetColumnType = typeOf().withNullability(isNullable), + valuePreprocessor = valuePreprocessor, + ) + +public fun typeInformationWithPostprocessingForValueColumnOf( + targetColumnType: KType, + columnPostprocessor: DbColumnPostprocessor?, +): TypeInformation = + typeInformationWithPostprocessingFor( + targetSchema = ColumnSchema.Value(targetColumnType), + columnPostprocessor = columnPostprocessor, + ) + +public inline fun typeInformationWithPostprocessingForValueColumnOf( + isNullable: Boolean, + columnPostprocessor: DbColumnPostprocessor?, +): TypeInformation = + typeInformationWithPostprocessingForValueColumnOf( + targetColumnType = typeOf

().withNullability(isNullable), + columnPostprocessor = columnPostprocessor, + ) + +// endregion + +/** + * This preprocessor can be created for types where you want to convert the values + * coming from [java.sql.ResultSet.getObject] to a different type more suitable to be put in a [DataColumn] + * + * @param J the type of the value coming from the JDBC driver. + * @param D the type of the column values after preprocessing. + */ +public fun interface DbValuePreprocessor { + + /** + * Converts the given [jdbcValue]: [J] to a [D]. + * + * If you intend to create a [org.jetbrains.kotlinx.dataframe.columns.ColumnGroup], + * return a [org.jetbrains.kotlinx.dataframe.DataRow] here. + * + * If you intend to create a [org.jetbrains.kotlinx.dataframe.columns.FrameColumn], + * return a [org.jetbrains.kotlinx.dataframe.DataFrame] here. + */ + public fun preprocess(jdbcValue: J?, typeInformation: TypeInformation<@UnsafeVariance J, @UnsafeVariance D, *>): D? +} + +public fun DbValuePreprocessor<*, *>.cast(): DbValuePreprocessor = + this as DbValuePreprocessor + +public fun DbValuePreprocessor<*, *>.castToAny(): DbValuePreprocessor = cast() + +/** + * @param D the type of the column values before postprocessing. + * @param P the type of the column values after postprocessing. + */ +public fun interface DbColumnPostprocessor { + + /** + * Converts the given [column]: [DataColumn] with values of type [D] to a [DataColumn] of with values of type [P]. 
+ */ + public fun postprocess( + column: DataColumn, + typeInformation: TypeInformation<*, @UnsafeVariance D, @UnsafeVariance P>, + ): DataColumn +} + +public fun DbColumnPostprocessor<*, *>.cast(): DbColumnPostprocessor = + this as DbColumnPostprocessor + +public fun DbColumnPostprocessor<*, *>.castToAny(): DbColumnPostprocessor = cast() diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt index 4e71d5010b..03c56f0af2 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt @@ -9,10 +9,11 @@ import org.jetbrains.kotlinx.dataframe.api.isFrameColumn import org.jetbrains.kotlinx.dataframe.api.isValueColumn import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.api.toDataFrame -import org.jetbrains.kotlinx.dataframe.io.db.AnyDbColumnTypeInformation +import org.jetbrains.kotlinx.dataframe.io.db.AnyTypeInformation import org.jetbrains.kotlinx.dataframe.io.db.DbType import org.jetbrains.kotlinx.dataframe.io.db.TableColumnMetadata import org.jetbrains.kotlinx.dataframe.io.db.cast +import org.jetbrains.kotlinx.dataframe.io.db.castToAny import org.jetbrains.kotlinx.dataframe.io.db.extractDBTypeFromConnection import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import java.sql.Connection @@ -910,7 +911,7 @@ internal fun fetchAndConvertDataFromResultSet( private fun buildColumnTypeInformation( tableColumns: List, dbType: DbType, -): List = +): List = tableColumns.indices.map { index -> dbType.getOrGenerateTypeInformation(tableColumns[index]) } @@ -921,7 +922,7 @@ private fun buildColumnTypeInformation( private fun readAllRowsFromResultSet( rs: ResultSet, tableColumns: List, - columnTypeInformation: List, + columnTypeInformation: List, dbType: DbType, limit: Int?, ): List> { @@ -931,7 +932,7 @@ private fun 
readAllRowsFromResultSet( while (rs.next() && (limit == null || rowsRead < limit)) { repeat(columnsCount) { columnIndex -> - val typeInformation = columnTypeInformation[columnIndex].cast() + val typeInformation = columnTypeInformation[columnIndex].castToAny() val value = dbType.getValueFromResultSet( rs = rs, columnIndex = columnIndex, @@ -954,13 +955,13 @@ private fun readAllRowsFromResultSet( private fun buildDataFrameFromColumnData( columnData: List>, tableColumns: List, - columnTypeInformation: List, + columnTypeInformation: List, dbType: DbType, inferNullability: Boolean, checkSchema: Boolean = true, // TODO add as configurable parameter ): AnyFrame = columnData.mapIndexed { index, values -> - val typeInformation = columnTypeInformation[index].cast() + val typeInformation = columnTypeInformation[index].castToAny() val column = dbType.buildDataColumn( name = tableColumns[index].name, values = values, From 4d8acfb4098627c02cae2577dbb220d8b695c691 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Thu, 11 Dec 2025 16:43:33 +0100 Subject: [PATCH 07/14] fixed duckdb tests --- .../kotlinx/dataframe/io/db/DbType.kt | 2 + .../kotlinx/dataframe/io/db/DuckDb.kt | 14 ++- .../dataframe/io/commonTestScenarios.kt | 24 ++--- .../kotlinx/dataframe/io/local/duckDbTest.kt | 99 ++++++++++++------- 4 files changed, 87 insertions(+), 52 deletions(-) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt index 0a78c714bf..ba4d37e40e 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt @@ -183,6 +183,8 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { ?: typeOf() } + // TODO add preprocessors for common types, like sql Arrays, Java datetimes, etc. 
+ val postprocessor = when (tableColumnMetadata.jdbcType) { Types.ARRAY -> diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt index 35b547924c..1706bde2a5 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -58,6 +58,7 @@ import java.util.Properties import kotlin.collections.toList import kotlin.reflect.KTypeProjection import kotlin.reflect.full.createType +import kotlin.reflect.full.withNullability import kotlin.time.Instant import kotlin.time.toKotlinInstant import kotlin.uuid.Uuid @@ -159,15 +160,14 @@ public object DuckDb : DbType("duckdb") { val (key, value) = parseMapTypes(sqlTypeName) val parsedKeyType = parseDuckDbType(key, false) - val parsedValueType = - parseDuckDbType(value, true).cast() + val parsedValueType = parseDuckDbType(value, true).castToAny() val targetMapType = Map::class.createType( listOf( KTypeProjection.invariant(parsedKeyType.targetSchema.type), KTypeProjection.invariant(parsedValueType.targetSchema.type), ), - ) + ).withNullability(isNullable) typeInformationWithPreprocessingForValueColumnOf, Map>( targetColumnType = targetMapType, @@ -186,8 +186,12 @@ public object DuckDb : DbType("duckdb") { parseDuckDbType(listType, true).castToAny() val targetListType = List::class.createType( - listOf(KTypeProjection.invariant(parsedListType.targetSchema.type)), - ) + listOf( + KTypeProjection.invariant( + parsedListType.targetSchema.type, + ), + ), + ).withNullability(isNullable) // todo maybe List should become FrameColumn typeInformationWithPreprocessingFor>( diff --git a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/commonTestScenarios.kt b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/commonTestScenarios.kt index 2669f7e944..39c55f0ab7 100644 --- 
a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/commonTestScenarios.kt +++ b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/commonTestScenarios.kt @@ -1,5 +1,11 @@ package org.jetbrains.kotlinx.dataframe.io +import io.kotest.assertions.Actual +import io.kotest.assertions.AssertionFailedError +import io.kotest.assertions.Exceptions +import io.kotest.assertions.Expected +import io.kotest.assertions.failure +import io.kotest.assertions.print.printed import io.kotest.assertions.withClue import io.kotest.matchers.shouldBe import org.intellij.lang.annotations.Language @@ -144,17 +150,11 @@ internal fun inferNullability(connection: Connection) { */ @Suppress("INVISIBLE_REFERENCE") fun AnyFrame.assertInferredTypesMatchSchema() { - withClue({ - """ - |Inferred schema must be <: Provided schema - | - |Inferred Schema: - |${inferType().schema().toString().lines().joinToString("\n|")} - | - |Provided Schema: - |${schema().toString().lines().joinToString("\n|")} - """.trimMargin() - }) { - schema().compare(inferType().schema()).isSuperOrMatches() shouldBe true + if (!schema().compare(inferType().schema()).isSuperOrMatches()) { + throw failure( + expected = Expected(inferType().schema().toString().lines().sorted().joinToString("\n").printed()), + actual = Actual(schema().toString().lines().sorted().joinToString("\n").printed()), + prependMessage = "Inferred schema must be <: Provided schema", + ) } } diff --git a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt index 3cc00e15a3..a9c2430c20 100644 --- a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt +++ b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt @@ -2,7 +2,10 @@ package org.jetbrains.kotlinx.dataframe.io.local +import io.kotest.assertions.withClue import io.kotest.matchers.shouldBe 
+import kotlinx.datetime.LocalDate +import kotlinx.datetime.LocalTime import org.duckdb.DuckDBConnection import org.duckdb.DuckDBResultSet import org.duckdb.JsonNode @@ -36,11 +39,12 @@ import java.nio.file.Files import java.sql.Blob import java.sql.DriverManager import java.sql.Timestamp -import java.time.LocalDate -import java.time.LocalTime -import java.time.OffsetDateTime import java.util.UUID import kotlin.io.path.createTempDirectory +import kotlin.time.Instant +import kotlin.time.toKotlinInstant +import kotlin.uuid.Uuid +import java.time.OffsetDateTime as JavaOffsetDateTime private const val URL = "jdbc:duckdb:" @@ -58,10 +62,10 @@ class DuckDbTest { ) { companion object { val expected = listOf( - Person(1, "John Doe", 30, 50000.0, LocalDate.of(2020, 1, 15)), - Person(2, "Jane Smith", 28, 55000.0, LocalDate.of(2021, 3, 20)), - Person(3, "Bob Johnson", 35, 65000.0, LocalDate.of(2019, 11, 10)), - Person(4, "Alice Brown", 32, 60000.0, LocalDate.of(2020, 7, 1)), + Person(1, "John Doe", 30, 50000.0, LocalDate(2020, 1, 15)), + Person(2, "Jane Smith", 28, 55000.0, LocalDate(2021, 3, 20)), + Person(3, "Bob Johnson", 35, 65000.0, LocalDate(2019, 11, 10)), + Person(4, "Alice Brown", 32, 60000.0, LocalDate(2020, 7, 1)), ).toDataFrame() } } @@ -91,7 +95,7 @@ class DuckDbTest { @ColumnName("date_col") val dateCol: LocalDate, @ColumnName("datetime_col") - val datetimeCol: Timestamp, + val datetimeCol: Instant, @ColumnName("decimal_col") val decimalCol: BigDecimal, @ColumnName("double_col") @@ -151,11 +155,11 @@ class DuckDbTest { @ColumnName("time_col") val timeCol: LocalTime, @ColumnName("timestamp_col") - val timestampCol: Timestamp, + val timestampCol: Instant, @ColumnName("timestamptz_col") - val timestamptzCol: OffsetDateTime, + val timestamptzCol: JavaOffsetDateTime, @ColumnName("timestampwtz_col") - val timestampwtzCol: OffsetDateTime, + val timestampwtzCol: JavaOffsetDateTime, @ColumnName("tinyint_col") val tinyintCol: Byte, @ColumnName("ubigint_col") @@ -179,7 
+183,7 @@ class DuckDbTest { @ColumnName("utinyint_col") val utinyintCol: Short, @ColumnName("uuid_col") - val uuidCol: UUID, + val uuidCol: Uuid, @ColumnName("varbinary_col") val varbinaryCol: Blob, @ColumnName("varchar_col") @@ -199,7 +203,7 @@ class DuckDbTest { byteaCol = DuckDBResultSet.DuckDBBlobResult(ByteBuffer.wrap("DEADBEEF".toByteArray())), charCol = "test", dateCol = LocalDate.parse("2025-06-19"), - datetimeCol = Timestamp.valueOf("2025-06-19 12:34:56"), + datetimeCol = Timestamp.valueOf("2025-06-19 12:34:56").toInstant().toKotlinInstant(), decimalCol = BigDecimal("123.45"), doubleCol = 3.14159, enumCol = "female", @@ -229,9 +233,9 @@ class DuckDbTest { stringCol = "test string", textCol = "test text", timeCol = LocalTime.parse("12:34:56"), - timestampCol = Timestamp.valueOf("2025-06-19 12:34:56"), - timestamptzCol = OffsetDateTime.parse("2025-06-19T12:34:56+02:00"), - timestampwtzCol = OffsetDateTime.parse("2025-06-19T12:34:56+02:00"), + timestampCol = Timestamp.valueOf("2025-06-19 12:34:56").toInstant().toKotlinInstant(), + timestamptzCol = JavaOffsetDateTime.parse("2025-06-19T12:34:56+02:00"), + timestampwtzCol = JavaOffsetDateTime.parse("2025-06-19T12:34:56+02:00"), tinyintCol = 127, ubigintCol = BigInteger("18446744073709551615"), uhugeintCol = BigInteger("340282366920938463463374607431768211455"), @@ -243,7 +247,7 @@ class DuckDbTest { uintCol = 4294967295L, usmallintCol = 65535, utinyintCol = 255, - uuidCol = UUID.fromString("a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11"), + uuidCol = Uuid.parse("a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11"), varbinaryCol = DuckDBResultSet.DuckDBBlobResult(ByteBuffer.wrap("DEADBEEF".toByteArray())), varcharCol = "test string", ), @@ -254,21 +258,21 @@ class DuckDbTest { @DataSchema data class NestedTypes( @ColumnName("ijstruct_col") - val ijstructCol: java.sql.Struct, + val ijstructCol: java.sql.Struct, // TODO @ColumnName("intarray_col") - val intarrayCol: java.sql.Array, + val intarrayCol: List, @ColumnName("intlist_col") - 
val intlistCol: java.sql.Array, + val intlistCol: List, @ColumnName("intstringmap_col") val intstringmapCol: Map, @ColumnName("intstrinstinggmap_col") val intstrinstinggmapCol: Map?>, @ColumnName("stringarray_col") - val stringarrayCol: java.sql.Array, + val stringarrayCol: List, @ColumnName("stringlist_col") - val stringlistCol: java.sql.Array, + val stringlistCol: List, @ColumnName("stringlistlist_col") - val stringlistlistCol: java.sql.Array, + val stringlistlistCol: List?>, @ColumnName("union_col") val unionCol: Any, ) @@ -310,7 +314,19 @@ class DuckDbTest { subset = DataFrame.readSqlQuery(connection, """SELECT test_table.name, test_table.age FROM test_table""") } - schema.compare(Person.expected.schema()).isSuperOrMatches() shouldBe true + withClue({ + """ + |Read schema must be <: expected schema + | + |Read Schema: + |${schema.toString().lines().joinToString("\n|")} + | + |expected Schema: + |${Person.expected.schema().toString().lines().joinToString("\n|")} + """.trimMargin() + }) { + schema.compare(Person.expected.schema()).isSuperOrMatches() shouldBe true + } df.cast(verify = true) shouldBe Person.expected df.assertInferredTypesMatchSchema() @@ -545,10 +561,24 @@ class DuckDbTest { df = DataFrame.readSqlTable(connection, "table1").reorderColumnsByName() } - schema.compare(GeneralPurposeTypes.expected.schema()).isSuperOrMatches() shouldBe true +// schema.toString().lines().sorted().joinToString("\n") shouldBe +// GeneralPurposeTypes.expected.schema().toString().lines().sorted().joinToString("\n") + withClue({ + """ + |Read schema must be <: expected schema + | + |Read Schema: + |${schema.toString().lines().joinToString("\n|")} + | + |expected Schema: + |${GeneralPurposeTypes.expected.schema().toString().lines().joinToString("\n|")} + """.trimMargin() + }) { + schema.compare(GeneralPurposeTypes.expected.schema()).isSuperOrMatches() shouldBe true + } // on some systems OffsetDateTime's get converted to UTC sometimes, let's compare them as Instant instead - 
fun AnyFrame.fixOffsetDateTime() = convert { colsOf() }.with { it.toInstant() } + fun AnyFrame.fixOffsetDateTime() = convert { colsOf() }.with { it.toInstant() } df.cast(verify = true).fixOffsetDateTime() shouldBe GeneralPurposeTypes.expected.fixOffsetDateTime() @@ -606,19 +636,18 @@ class DuckDbTest { df as DataFrame df.single().let { - it[{ "intarray_col"() }].array shouldBe arrayOf(1, 2, null) - it[{ "stringarray_col"() }].array shouldBe arrayOf("a", "ab", "abc") - it[{ "intlist_col"() }].array shouldBe arrayOf(1, 2, 3) - it[{ "stringlist_col"() }].array shouldBe arrayOf("a", "ab", "abc") - (it[{ "stringlistlist_col"() }].array as Array<*>) - .map { (it as java.sql.Array?)?.array } shouldBe listOf(arrayOf("a", "ab"), arrayOf("abc"), null) - it[{ "intstringmap_col">() }] shouldBe mapOf(1 to "value1", 200 to "value2") - it[{ "intstrinstinggmap_col">>() }] shouldBe mapOf( + it["intarray_col"] shouldBe listOf(1, 2, null) + it["stringarray_col"] shouldBe listOf("a", "ab", "abc") + it["intlist_col"] shouldBe listOf(1, 2, 3) + it["stringlist_col"] shouldBe listOf("a", "ab", "abc") + it["stringlistlist_col"] shouldBe listOf(listOf("a", "ab"), listOf("abc"), null) + it["intstringmap_col"] shouldBe mapOf(1 to "value1", 200 to "value2") + it["intstrinstinggmap_col"] shouldBe mapOf( 1 to mapOf("value1" to "a", "value2" to "b"), 200 to mapOf("value1" to "c", "value2" to "d"), ) it[{ "ijstruct_col"() }].attributes shouldBe arrayOf(42, "answer") - it[{ "union_col"() }] shouldBe 2 + it["union_col"] shouldBe 2 } } From 6539830faeb30003a695515a524bb847426d8473 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Fri, 12 Dec 2025 20:23:44 +0100 Subject: [PATCH 08/14] added jdbc source type parameter --- .../kotlinx/dataframe/io/db/DbType.kt | 3 +++ .../kotlinx/dataframe/io/db/DuckDb.kt | 20 ++++++++++++++++- .../dataframe/io/db/TypeInformation.kt | 22 +++++++++++++++---- 3 files changed, 40 insertions(+), 5 deletions(-) diff --git 
a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt index ba4d37e40e..12243ebe79 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt @@ -196,6 +196,7 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { } return typeInformationWithPostprocessingFor( + jdbcSourceType = kType.withNullability(tableColumnMetadata.isNullable), targetSchema = ColumnSchema.Value(kType.withNullability(tableColumnMetadata.isNullable)), columnPostprocessor = postprocessor?.castToAny(), ) @@ -242,6 +243,8 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { type = schema.type, ) + // TODO, this should be postponed to post-processing. + // List.toDataFrame() is heavy! is ColumnSchema.Group -> DataColumn.createColumnGroup( name = name, diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt index 1706bde2a5..efb0042b86 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -59,6 +59,7 @@ import kotlin.collections.toList import kotlin.reflect.KTypeProjection import kotlin.reflect.full.createType import kotlin.reflect.full.withNullability +import kotlin.reflect.typeOf import kotlin.time.Instant import kotlin.time.toKotlinInstant import kotlin.uuid.Uuid @@ -170,6 +171,7 @@ public object DuckDb : DbType("duckdb") { ).withNullability(isNullable) typeInformationWithPreprocessingForValueColumnOf, Map>( + jdbcSourceType = typeOf>().withNullability(isNullable), // unused targetColumnType = targetMapType, ) { map, _ -> // only need to preprocess the values, as the keys are just Strings @@ -195,6 +197,7 
@@ public object DuckDb : DbType("duckdb") { // todo maybe List should become FrameColumn typeInformationWithPreprocessingFor>( + jdbcSourceType = typeOf().withNullability(isNullable), targetSchema = ColumnSchema.Value(targetListType), ) { sqlArray, _ -> sqlArray @@ -204,7 +207,11 @@ public object DuckDb : DbType("duckdb") { } // TODO requires #1266 for specific types - STRUCT -> typeInformationForValueColumnOf(isNullable) + STRUCT -> { + val structTypes = parseStructType(sqlTypeName) + + typeInformationForValueColumnOf(isNullable) + } // Cannot handle this in Kotlin UNION -> typeInformationForValueColumnOf(isNullable) @@ -269,6 +276,17 @@ public object DuckDb : DbType("duckdb") { return typeString.take(typeString.indexOfLast { it == '[' }) } + /** Parses "STRUCT(v VARCHAR, i INTEGER)" into [("v", "VARCHAR"), ("i", "INTEGER")] */ + internal fun parseStructType(typeString: String): Map { + if (!typeString.startsWith("STRUCT(")) { + error("invalid STRUCT type: $typeString") + } + return typeString.removeSurrounding("STRUCT(", ")") + .split(",") + .map { it.trim().split(" ") } + .associate { (name, type) -> name to type } + } + /** * How to filter out system tables from user-created ones when using * [DataFrame.readAllSqlTables][DataFrame.Companion.readAllSqlTables] and diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt index e9faa9b998..f388042fad 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt @@ -28,6 +28,7 @@ public typealias AnyTypeInformation = TypeInformation<*, *, *> * to a [DataColumn] of with values of type [P]. 
*/ public open class TypeInformation( + public open val jdbcSourceType: KType, public open val targetSchema: ColumnSchema, public open val valuePreprocessor: DbValuePreprocessor?, public open val columnPostprocessor: DbColumnPostprocessor?, @@ -55,38 +56,45 @@ public fun TypeInformation<*, *, *>.castToAny(): TypeInformation // region generic constructors public fun typeInformationWithProcessingFor( + jdbcSourceType: KType, targetSchema: ColumnSchema, valuePreprocessor: DbValuePreprocessor?, columnPostprocessor: DbColumnPostprocessor?, ): TypeInformation = TypeInformation( + jdbcSourceType = jdbcSourceType, targetSchema = targetSchema, valuePreprocessor = valuePreprocessor, columnPostprocessor = columnPostprocessor, ) -public fun typeInformationFor(targetSchema: ColumnSchema): TypeInformation = +public fun typeInformationFor(jdbcSourceType: KType, targetSchema: ColumnSchema): TypeInformation = typeInformationWithProcessingFor( + jdbcSourceType = jdbcSourceType, targetSchema = targetSchema, valuePreprocessor = null, columnPostprocessor = null, ) public fun typeInformationWithPreprocessingFor( + jdbcSourceType: KType, targetSchema: ColumnSchema, valuePreprocessor: DbValuePreprocessor?, ): TypeInformation = typeInformationWithProcessingFor( + jdbcSourceType = jdbcSourceType, targetSchema = targetSchema, valuePreprocessor = valuePreprocessor, columnPostprocessor = null, ) public fun typeInformationWithPostprocessingFor( + jdbcSourceType: KType, targetSchema: ColumnSchema, columnPostprocessor: DbColumnPostprocessor?, ): TypeInformation = typeInformationWithProcessingFor( + jdbcSourceType = jdbcSourceType, targetSchema = targetSchema, valuePreprocessor = null, columnPostprocessor = columnPostprocessor, @@ -97,43 +105,49 @@ public fun typeInformationWithPostprocessingFor( // region ValueColumn constructors public fun typeInformationForValueColumnOf(kType: KType): TypeInformation = - typeInformationFor(targetSchema = ColumnSchema.Value(kType)) + 
typeInformationFor(jdbcSourceType = kType, targetSchema = ColumnSchema.Value(kType)) public inline fun typeInformationForValueColumnOf(isNullable: Boolean): TypeInformation = typeInformationForValueColumnOf(typeOf().withNullability(isNullable)) public fun typeInformationWithPreprocessingForValueColumnOf( + jdbcSourceType: KType, targetColumnType: KType, valuePreprocessor: DbValuePreprocessor?, ): TypeInformation = typeInformationWithPreprocessingFor( + jdbcSourceType = jdbcSourceType, targetSchema = ColumnSchema.Value(targetColumnType), valuePreprocessor = valuePreprocessor, ) -public inline fun typeInformationWithPreprocessingForValueColumnOf( +public inline fun typeInformationWithPreprocessingForValueColumnOf( isNullable: Boolean, valuePreprocessor: DbValuePreprocessor?, ): TypeInformation = typeInformationWithPreprocessingForValueColumnOf( + jdbcSourceType = typeOf().withNullability(isNullable), targetColumnType = typeOf().withNullability(isNullable), valuePreprocessor = valuePreprocessor, ) public fun typeInformationWithPostprocessingForValueColumnOf( + jdbcSourceType: KType, targetColumnType: KType, columnPostprocessor: DbColumnPostprocessor?, ): TypeInformation = typeInformationWithPostprocessingFor( + jdbcSourceType = jdbcSourceType, targetSchema = ColumnSchema.Value(targetColumnType), columnPostprocessor = columnPostprocessor, ) -public inline fun typeInformationWithPostprocessingForValueColumnOf( +public inline fun typeInformationWithPostprocessingForValueColumnOf( isNullable: Boolean, columnPostprocessor: DbColumnPostprocessor?, ): TypeInformation = typeInformationWithPostprocessingForValueColumnOf( + jdbcSourceType = typeOf().withNullability(isNullable), targetColumnType = typeOf

().withNullability(isNullable), columnPostprocessor = columnPostprocessor, ) From e114fca61fb1cde355c6b777736e19f7eb2c6f83 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Mon, 15 Dec 2025 14:22:53 +0100 Subject: [PATCH 09/14] struct parsing for duckdb working! --- .../kotlinx/dataframe/io/db/DuckDb.kt | 78 +++++++++++++++++-- .../kotlinx/dataframe/io/local/duckDbTest.kt | 8 +- 2 files changed, 79 insertions(+), 7 deletions(-) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt index efb0042b86..915092a17c 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -42,7 +42,21 @@ import org.duckdb.DuckDBColumnType.UUID import org.duckdb.DuckDBColumnType.VARCHAR import org.duckdb.DuckDBResultSetMetaData import org.duckdb.JsonNode +import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.AnyRow +import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.DataRow +import org.jetbrains.kotlinx.dataframe.api.Infer +import org.jetbrains.kotlinx.dataframe.api.asColumnGroup +import org.jetbrains.kotlinx.dataframe.api.asDataColumn +import org.jetbrains.kotlinx.dataframe.api.cast +import org.jetbrains.kotlinx.dataframe.api.castToNotNullable +import org.jetbrains.kotlinx.dataframe.api.first +import org.jetbrains.kotlinx.dataframe.api.toDataFrame +import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup +import org.jetbrains.kotlinx.dataframe.impl.DataCollector +import org.jetbrains.kotlinx.dataframe.impl.schema.DataFrameSchemaImpl import org.jetbrains.kotlinx.dataframe.io.DbConnectionConfig import org.jetbrains.kotlinx.dataframe.io.readAllSqlTables import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema @@ -56,6 +70,7 @@ 
import java.sql.ResultSet import java.sql.Struct import java.util.Properties import kotlin.collections.toList +import kotlin.reflect.KClass import kotlin.reflect.KTypeProjection import kotlin.reflect.full.createType import kotlin.reflect.full.withNullability @@ -100,7 +115,7 @@ public object DuckDb : DbType("duckdb") { */ internal fun parseDuckDbType(sqlTypeName: String, isNullable: Boolean): AnyTypeInformation = duckDbTypeCache.getOrPut(Pair(sqlTypeName, isNullable)) { - when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) { + return@getOrPut when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) { BOOLEAN -> typeInformationForValueColumnOf(isNullable) TINYINT -> typeInformationForValueColumnOf(isNullable) @@ -182,7 +197,6 @@ public object DuckDb : DbType("duckdb") { } LIST, ARRAY -> { - // TODO requires #1266 and #1273 for specific types val listType = parseListType(sqlTypeName) val parsedListType = parseDuckDbType(listType, true).castToAny() @@ -206,11 +220,46 @@ public object DuckDb : DbType("duckdb") { } } - // TODO requires #1266 for specific types STRUCT -> { - val structTypes = parseStructType(sqlTypeName) + val structEntries = parseStructType(sqlTypeName) + val parsedStructEntries = structEntries.mapValues { (_, type) -> + parseDuckDbType(sqlTypeName = type, isNullable = true) + } - typeInformationForValueColumnOf(isNullable) + val targetSchema = ColumnSchema.Group( + schema = DataFrameSchemaImpl(parsedStructEntries.mapValues { it.value.targetSchema }), + contentType = typeOf(), + ) + + typeInformationWithProcessingFor, DataRow<*>>( + jdbcSourceType = typeOf().withNullability(isNullable), + targetSchema = targetSchema, + valuePreprocessor = { struct, _ -> + // NOTE DataRows cannot be `null` in DataFrame, instead, all its fields become `null` + if (struct == null) { + parsedStructEntries.mapValues { null } + } else { + // read data from the struct + val attrs = struct.getAttributes( + parsedStructEntries.mapValues { + 
(it.value.jdbcSourceType.classifier!! as KClass<*>).java + }, + ) + + // and potentially, preprocess each value individually + parsedStructEntries.entries.withIndex().associate { (i, entry) -> + entry.key to entry.value.castToAny().preprocess(attrs[i]) + } + } + }, + columnPostprocessor = { col, _ -> + col.castToNotNullable() + .values() + .toDataFrame() + .asColumnGroup(col.name()) + .asDataColumn() + }, + ) } // Cannot handle this in Kotlin @@ -222,6 +271,25 @@ public object DuckDb : DbType("duckdb") { } } + // Overriding buildDataColumn behavior so we can create the column group in post-processing for efficiency + override fun buildDataColumn( + name: String, + values: List, + typeInformation: TypeInformation<*, D, *>, + inferNullability: Boolean, + ): DataColumn = + when (val schema = typeInformation.targetSchema) { + is ColumnSchema.Group -> + DataColumn.createValueColumn( + name = name, + values = values, + infer = if (inferNullability) Infer.Nulls else Infer.None, + type = schema.type, + ) + + else -> super.buildDataColumn(name, values, typeInformation, inferNullability) + } + private fun SqlArray.toList(): List = when (val array = this.array) { is IntArray -> array.toList() diff --git a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt index a9c2430c20..085d519220 100644 --- a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt +++ b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt @@ -255,10 +255,13 @@ class DuckDbTest { } } + @DataSchema + data class NestedEntry(val i: Int, val j: String) + @DataSchema data class NestedTypes( @ColumnName("ijstruct_col") - val ijstructCol: java.sql.Struct, // TODO + val ijstructCol: NestedEntry, // TODO @ColumnName("intarray_col") val intarrayCol: List, @ColumnName("intlist_col") @@ -646,7 +649,8 @@ class DuckDbTest { 1 to 
mapOf("value1" to "a", "value2" to "b"), 200 to mapOf("value1" to "c", "value2" to "d"), ) - it[{ "ijstruct_col"() }].attributes shouldBe arrayOf(42, "answer") + it[{ "ijstruct_col"["i"]() }] shouldBe 42 + it[{ "ijstruct_col"["j"]() }] shouldBe "answer" it["union_col"] shouldBe 2 } } From 1a9f5818f93fae06004289b225c647a8833b0d05 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Tue, 16 Dec 2025 12:21:20 +0100 Subject: [PATCH 10/14] merging column creation and post processing dbType --- .../kotlinx/dataframe/io/db/DbType.kt | 60 ++++----------- .../kotlinx/dataframe/io/db/DuckDb.kt | 38 +-------- .../dataframe/io/db/TypeInformation.kt | 77 +++++++++++++------ .../kotlinx/dataframe/io/readJdbc.kt | 5 +- 4 files changed, 73 insertions(+), 107 deletions(-) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt index 12243ebe79..efb7d318aa 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt @@ -1,16 +1,8 @@ package org.jetbrains.kotlinx.dataframe.io.db -import org.jetbrains.kotlinx.dataframe.AnyFrame -import org.jetbrains.kotlinx.dataframe.AnyRow import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.api.Infer -import org.jetbrains.kotlinx.dataframe.api.asDataColumn -import org.jetbrains.kotlinx.dataframe.api.asValueColumn -import org.jetbrains.kotlinx.dataframe.api.cast import org.jetbrains.kotlinx.dataframe.api.schema -import org.jetbrains.kotlinx.dataframe.api.toColumn -import org.jetbrains.kotlinx.dataframe.api.toDataFrame -import org.jetbrains.kotlinx.dataframe.columns.ValueColumn import org.jetbrains.kotlinx.dataframe.io.DbConnectionConfig import org.jetbrains.kotlinx.dataframe.io.readAllSqlTables import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema @@ -188,8 +180,13 @@ public abstract 
class DbType(public val dbTypeInJdbcUrl: String) { val postprocessor = when (tableColumnMetadata.jdbcType) { Types.ARRAY -> - DbColumnPostprocessor, Any> { column, _ -> - handleArrayValues(column.asValueColumn()) + DbColumnBuilder, Any> { name, values, typeInformation, inferNullability -> + DataColumn.createValueColumn( + name = name, + values = handleArrayValues(values), + infer = if (inferNullability) Infer.Nulls else Infer.None, + type = typeInformation.targetSchema.type, + ) } else -> null @@ -198,7 +195,7 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { return typeInformationWithPostprocessingFor( jdbcSourceType = kType.withNullability(tableColumnMetadata.isNullable), targetSchema = ColumnSchema.Value(kType.withNullability(tableColumnMetadata.isNullable)), - columnPostprocessor = postprocessor?.castToAny(), + columnBuilder = postprocessor?.castToAny(), ) } @@ -228,41 +225,12 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { typeInformation: TypeInformation, ): D? = typeInformation.preprocess(value) - public open fun buildDataColumn( + public open fun buildDataColumn( name: String, values: List, - typeInformation: TypeInformation<*, D, *>, - inferNullability: Boolean, - ): DataColumn = - when (val schema = typeInformation.targetSchema) { - is ColumnSchema.Value -> - DataColumn.createValueColumn( - name = name, - values = values, - infer = if (inferNullability) Infer.Nulls else Infer.None, - type = schema.type, - ) - - // TODO, this should be postponed to post-processing. - // List.toDataFrame() is heavy! 
- is ColumnSchema.Group -> - DataColumn.createColumnGroup( - name = name, - df = (values as List).toDataFrame(), - ).asDataColumn().cast() - - is ColumnSchema.Frame -> - DataColumn.createFrameColumn( - name = name, - groups = values as List, - schema = lazy { schema.schema }, - ).cast() - } - - public fun postProcessDataColumn( - column: DataColumn, typeInformation: TypeInformation<*, D, P>, - ): DataColumn = typeInformation.postprocess(column) + inferNullability: Boolean, + ): DataColumn = typeInformation.buildDataColumn(name, values, inferNullability) /** * Checks if the given table name is a system table for the specified database type. @@ -531,9 +499,9 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { * @param values raw values containing SQL Array objects * @return list of consistently typed arrays, or original arrays if no common type exists */ - private fun handleArrayValues(values: ValueColumn): DataColumn { + private fun handleArrayValues(values: List): List { // Intermediate variable for the first mapping - val sqlArrays = values.values().mapNotNull { + val sqlArrays = values.mapNotNull { (it as? java.sql.Array)?.array?.let { array -> array as? 
Array<*> } } @@ -553,7 +521,7 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { sqlArrays.map { castArray(it, commonElementType).toTypedArray() } } else { sqlArrays - }.toColumn(values.name()) + } } /** Utility function to cast arrays based on the type of elements */ diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt index 915092a17c..8e7e21c65b 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -42,20 +42,11 @@ import org.duckdb.DuckDBColumnType.UUID import org.duckdb.DuckDBColumnType.VARCHAR import org.duckdb.DuckDBResultSetMetaData import org.duckdb.JsonNode -import org.jetbrains.kotlinx.dataframe.AnyFrame -import org.jetbrains.kotlinx.dataframe.AnyRow -import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.DataRow -import org.jetbrains.kotlinx.dataframe.api.Infer import org.jetbrains.kotlinx.dataframe.api.asColumnGroup import org.jetbrains.kotlinx.dataframe.api.asDataColumn -import org.jetbrains.kotlinx.dataframe.api.cast -import org.jetbrains.kotlinx.dataframe.api.castToNotNullable -import org.jetbrains.kotlinx.dataframe.api.first import org.jetbrains.kotlinx.dataframe.api.toDataFrame -import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup -import org.jetbrains.kotlinx.dataframe.impl.DataCollector import org.jetbrains.kotlinx.dataframe.impl.schema.DataFrameSchemaImpl import org.jetbrains.kotlinx.dataframe.io.DbConnectionConfig import org.jetbrains.kotlinx.dataframe.io.readAllSqlTables @@ -69,7 +60,6 @@ import java.sql.DriverManager import java.sql.ResultSet import java.sql.Struct import java.util.Properties -import kotlin.collections.toList import kotlin.reflect.KClass import kotlin.reflect.KTypeProjection 
import kotlin.reflect.full.createType @@ -115,7 +105,7 @@ public object DuckDb : DbType("duckdb") { */ internal fun parseDuckDbType(sqlTypeName: String, isNullable: Boolean): AnyTypeInformation = duckDbTypeCache.getOrPut(Pair(sqlTypeName, isNullable)) { - return@getOrPut when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) { + when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) { BOOLEAN -> typeInformationForValueColumnOf(isNullable) TINYINT -> typeInformationForValueColumnOf(isNullable) @@ -252,11 +242,10 @@ public object DuckDb : DbType("duckdb") { } } }, - columnPostprocessor = { col, _ -> - col.castToNotNullable() - .values() + columnBuilder = { name, values, _, _ -> + (values as List>) .toDataFrame() - .asColumnGroup(col.name()) + .asColumnGroup(name) .asDataColumn() }, ) @@ -271,25 +260,6 @@ public object DuckDb : DbType("duckdb") { } } - // Overriding buildDataColumn behavior so we can create the column group in post-processing for efficiency - override fun buildDataColumn( - name: String, - values: List, - typeInformation: TypeInformation<*, D, *>, - inferNullability: Boolean, - ): DataColumn = - when (val schema = typeInformation.targetSchema) { - is ColumnSchema.Group -> - DataColumn.createValueColumn( - name = name, - values = values, - infer = if (inferNullability) Infer.Nulls else Infer.None, - type = schema.type, - ) - - else -> super.buildDataColumn(name, values, typeInformation, inferNullability) - } - private fun SqlArray.toList(): List = when (val array = this.array) { is IntArray -> array.toList() diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt index f388042fad..6791e7666f 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt @@ -1,7 +1,12 @@ package 
org.jetbrains.kotlinx.dataframe.io.db +import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.AnyRow import org.jetbrains.kotlinx.dataframe.DataColumn +import org.jetbrains.kotlinx.dataframe.api.Infer +import org.jetbrains.kotlinx.dataframe.api.asDataColumn import org.jetbrains.kotlinx.dataframe.api.cast +import org.jetbrains.kotlinx.dataframe.api.toDataFrame import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import kotlin.reflect.KType import kotlin.reflect.full.withNullability @@ -18,20 +23,20 @@ public typealias AnyTypeInformation = TypeInformation<*, *, *> * * @param J the type of the value coming from the JDBC driver. * @param D the type of the column values after preprocessing. Will be equal to [J] if [valuePreprocessor] is `null`. - * @param P the type of the column values after postprocessing. Will be equal to [D] if [columnPostprocessor] is `null`. + * @param P the type of the column values after postprocessing. Will be equal to [D] if [columnBuilder] is `null`. * * @property targetSchema the target schema of the column after running the optional - * [valuePreprocessor] and [columnPostprocessor]. + * [valuePreprocessor] and [columnBuilder]. * @property valuePreprocessor an optional function that converts values from [java.sql.ResultSet.getObject] * to a cell/row suitable to be put into a [DataColumn]. - * @property columnPostprocessor an optional function that converts a [DataColumn] with values of type [D] + * @property columnBuilder an optional function that converts a [List] with values of type [D] * to a [DataColumn] of with values of type [P]. */ public open class TypeInformation( public open val jdbcSourceType: KType, public open val targetSchema: ColumnSchema, public open val valuePreprocessor: DbValuePreprocessor?, - public open val columnPostprocessor: DbColumnPostprocessor?, + public open val columnBuilder: DbColumnBuilder?, ) { public open fun preprocess(value: J?): D? 
{ valuePreprocessor?.let { valuePreprocessor -> @@ -40,11 +45,34 @@ public open class TypeInformation( return value as D? } - public open fun postprocess(column: DataColumn): DataColumn { - columnPostprocessor?.let { columnPostprocessor -> - return columnPostprocessor.postprocess(column, this) + public open fun buildDataColumn(name: String, values: List, inferNullability: Boolean): DataColumn { + columnBuilder?.let { columnPostprocessor -> + return columnPostprocessor.buildDataColumn(name, values, this, inferNullability) + } + return when (val schema = targetSchema) { + is ColumnSchema.Value -> + DataColumn.createValueColumn( + name = name, + values = values, + infer = if (inferNullability) Infer.Nulls else Infer.None, + type = schema.type, + ).cast() + + // TODO, this case should be avoided. + // Creating `n` DataRows is heavy! + is ColumnSchema.Group -> + DataColumn.createColumnGroup( + name = name, + df = (values as List).toDataFrame(), + ).asDataColumn().cast() + + is ColumnSchema.Frame -> + DataColumn.createFrameColumn( + name = name, + groups = values as List, + schema = lazy { schema.schema }, + ).cast() } - return column.cast() } } @@ -59,13 +87,13 @@ public fun typeInformationWithProcessingFor( jdbcSourceType: KType, targetSchema: ColumnSchema, valuePreprocessor: DbValuePreprocessor?, - columnPostprocessor: DbColumnPostprocessor?, + columnBuilder: DbColumnBuilder?, ): TypeInformation = TypeInformation( jdbcSourceType = jdbcSourceType, targetSchema = targetSchema, valuePreprocessor = valuePreprocessor, - columnPostprocessor = columnPostprocessor, + columnBuilder = columnBuilder, ) public fun typeInformationFor(jdbcSourceType: KType, targetSchema: ColumnSchema): TypeInformation = @@ -73,7 +101,7 @@ public fun typeInformationFor(jdbcSourceType: KType, targetSchema: Col jdbcSourceType = jdbcSourceType, targetSchema = targetSchema, valuePreprocessor = null, - columnPostprocessor = null, + columnBuilder = null, ) public fun typeInformationWithPreprocessingFor( 
@@ -85,19 +113,19 @@ public fun typeInformationWithPreprocessingFor( jdbcSourceType = jdbcSourceType, targetSchema = targetSchema, valuePreprocessor = valuePreprocessor, - columnPostprocessor = null, + columnBuilder = null, ) public fun typeInformationWithPostprocessingFor( jdbcSourceType: KType, targetSchema: ColumnSchema, - columnPostprocessor: DbColumnPostprocessor?, + columnBuilder: DbColumnBuilder?, ): TypeInformation = typeInformationWithProcessingFor( jdbcSourceType = jdbcSourceType, targetSchema = targetSchema, valuePreprocessor = null, - columnPostprocessor = columnPostprocessor, + columnBuilder = columnBuilder, ) // endregion @@ -134,17 +162,17 @@ public inline fun typeInformationWithPreproce public fun typeInformationWithPostprocessingForValueColumnOf( jdbcSourceType: KType, targetColumnType: KType, - columnPostprocessor: DbColumnPostprocessor?, + columnPostprocessor: DbColumnBuilder?, ): TypeInformation = typeInformationWithPostprocessingFor( jdbcSourceType = jdbcSourceType, targetSchema = ColumnSchema.Value(targetColumnType), - columnPostprocessor = columnPostprocessor, + columnBuilder = columnPostprocessor, ) public inline fun typeInformationWithPostprocessingForValueColumnOf( isNullable: Boolean, - columnPostprocessor: DbColumnPostprocessor?, + columnPostprocessor: DbColumnBuilder?, ): TypeInformation = typeInformationWithPostprocessingForValueColumnOf( jdbcSourceType = typeOf().withNullability(isNullable), @@ -184,18 +212,19 @@ public fun DbValuePreprocessor<*, *>.castToAny(): DbValuePreprocessor * @param D the type of the column values before postprocessing. * @param P the type of the column values after postprocessing. */ -public fun interface DbColumnPostprocessor { +public fun interface DbColumnBuilder { /** - * Converts the given [column]: [DataColumn] with values of type [D] to a [DataColumn] of with values of type [P]. + * Converts the given [values]: [List] with values of type [D] to a [DataColumn] with values of type [P].
*/ - public fun postprocess( - column: DataColumn, + public fun buildDataColumn( + name: String, + values: List, typeInformation: TypeInformation<*, @UnsafeVariance D, @UnsafeVariance P>, + inferNullability: Boolean, ): DataColumn } -public fun DbColumnPostprocessor<*, *>.cast(): DbColumnPostprocessor = - this as DbColumnPostprocessor +public fun DbColumnBuilder<*, *>.cast(): DbColumnBuilder = this as DbColumnBuilder -public fun DbColumnPostprocessor<*, *>.castToAny(): DbColumnPostprocessor = cast() +public fun DbColumnBuilder<*, *>.castToAny(): DbColumnBuilder = cast() diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt index 03c56f0af2..35aeb33c48 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt @@ -968,13 +968,12 @@ private fun buildDataFrameFromColumnData( typeInformation = typeInformation, inferNullability = inferNullability, ) - val postProcessedColumn = dbType.postProcessDataColumn(column, typeInformation) if (checkSchema) { - postProcessedColumn.checkSchema(typeInformation.targetSchema) + column.checkSchema(typeInformation.targetSchema) } - postProcessedColumn + column }.toDataFrame() private fun AnyCol.checkSchema(expected: ColumnSchema) { From 6bf27fb35c2f764408e6375150d5a56e926ec419 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Tue, 16 Dec 2025 18:24:44 +0100 Subject: [PATCH 11/14] created AdvancedDbType so we can have "simple" and "advanced" db types, that use JdbcTypeMapping --- .../kotlinx/dataframe/io/db/AdvancedDbType.kt | 59 ++++++++ .../kotlinx/dataframe/io/db/DbType.kt | 123 ++++++++++------ .../kotlinx/dataframe/io/db/DuckDb.kt | 61 ++++---- .../jetbrains/kotlinx/dataframe/io/db/H2.kt | 7 +- ...{TypeInformation.kt => JdbcTypeMapping.kt} | 131 +++++++++--------- .../kotlinx/dataframe/io/db/MariaDb.kt | 
10 +- .../kotlinx/dataframe/io/db/MySql.kt | 8 +- .../kotlinx/dataframe/io/db/PostgreSql.kt | 8 +- .../dataframe/io/readDataFrameSchema.kt | 28 +--- .../kotlinx/dataframe/io/readJdbc.kt | 131 ++++++++++++------ .../kotlinx/dataframe/io/local/duckDbTest.kt | 6 +- 11 files changed, 350 insertions(+), 222 deletions(-) create mode 100644 dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/AdvancedDbType.kt rename dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/{TypeInformation.kt => JdbcTypeMapping.kt} (65%) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/AdvancedDbType.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/AdvancedDbType.kt new file mode 100644 index 0000000000..acba290819 --- /dev/null +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/AdvancedDbType.kt @@ -0,0 +1,59 @@ +package org.jetbrains.kotlinx.dataframe.io.db + +import org.jetbrains.kotlinx.dataframe.DataColumn +import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema +import kotlin.reflect.KType + +/** + * Alternative version of [DbType] that allows customizing the type mapping + * by initializing a [JdbcTypeMapping] instance for each JDBC type. + * + * This can be helpful for JDBC databases that support structured data, like [DuckDb], + * or that need to do a lot of type mapping.
+ */ +public abstract class AdvancedDbType(dbTypeInJdbcUrl: String) : DbType(dbTypeInJdbcUrl) { + + protected abstract fun generateTypeMapping(tableColumnMetadata: TableColumnMetadata): AnyJdbcTypeMapping + + private val typeMappingCache = mutableMapOf() + + protected fun getTypeMapping(tableColumnMetadata: TableColumnMetadata): AnyJdbcTypeMapping = + typeMappingCache.getOrPut(tableColumnMetadata) { + generateTypeMapping(tableColumnMetadata) + } + + final override fun getExpectedJdbcType(tableColumnMetadata: TableColumnMetadata): KType = + getTypeMapping(tableColumnMetadata).expectedJdbcType + + final override fun getPreprocessedValueType( + tableColumnMetadata: TableColumnMetadata, + expectedJdbcType: KType, + ): KType = getTypeMapping(tableColumnMetadata).preprocessedValueType + + final override fun getTargetColumnSchema( + tableColumnMetadata: TableColumnMetadata, + expectedValueType: KType, + ): ColumnSchema = getTypeMapping(tableColumnMetadata).targetSchema + + final override fun preprocessValue( + value: J?, + tableColumnMetadata: TableColumnMetadata, + expectedJdbcType: KType, + expectedPreprocessedValueType: KType, + ): D? 
= getTypeMapping(tableColumnMetadata).cast().preprocessOrCast(value) + + final override fun buildDataColumn( + name: String, + values: List, + tableColumnMetadata: TableColumnMetadata, + targetColumnSchema: ColumnSchema, + inferNullability: Boolean, + ): DataColumn = + getTypeMapping(tableColumnMetadata).cast() + .buildDataColumnOrNull(name, values, inferNullability) + ?: values.toDataColumn( + name = name, + targetColumnSchema = targetColumnSchema, + inferNullability = inferNullability, + ) +} diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt index efb7d318aa..19fbd7ccf7 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt @@ -1,8 +1,13 @@ package org.jetbrains.kotlinx.dataframe.io.db +import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.AnyRow import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.api.Infer +import org.jetbrains.kotlinx.dataframe.api.asDataColumn +import org.jetbrains.kotlinx.dataframe.api.cast import org.jetbrains.kotlinx.dataframe.api.schema +import org.jetbrains.kotlinx.dataframe.api.toDataFrame import org.jetbrains.kotlinx.dataframe.io.DbConnectionConfig import org.jetbrains.kotlinx.dataframe.io.readAllSqlTables import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema @@ -126,21 +131,7 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { Types.TIMESTAMP_WITH_TIMEZONE to typeOf(), ) - private val typeInformationCache = mutableMapOf() - - /** - * Returns a [TypeInformation] produced from [tableColumnMetadata]. 
- */ - public fun getOrGenerateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyTypeInformation = - typeInformationCache.getOrPut(tableColumnMetadata) { generateTypeInformation(tableColumnMetadata) } - - /** - * Returns a [TypeInformation] produced from [tableColumnMetadata]. - * - * This function can be overridden by returning your own [TypeInformation] or a subtype of that. - * Do note that this class needs to be stateless, so this function can be memoized. - */ - public open fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyTypeInformation { + public open fun getExpectedJdbcType(tableColumnMetadata: TableColumnMetadata): KType { val kType = when { tableColumnMetadata.jdbcType == Types.OTHER -> when (tableColumnMetadata.javaClassName) { @@ -175,28 +166,7 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { ?: typeOf() } - // TODO add preprocessors for common types, like sql Arrays, Java datetimes, etc. - - val postprocessor = - when (tableColumnMetadata.jdbcType) { - Types.ARRAY -> - DbColumnBuilder, Any> { name, values, typeInformation, inferNullability -> - DataColumn.createValueColumn( - name = name, - values = handleArrayValues(values), - infer = if (inferNullability) Infer.Nulls else Infer.None, - type = typeInformation.targetSchema.type, - ) - } - - else -> null - } - - return typeInformationWithPostprocessingFor( - jdbcSourceType = kType.withNullability(tableColumnMetadata.isNullable), - targetSchema = ColumnSchema.Value(kType.withNullability(tableColumnMetadata.isNullable)), - columnBuilder = postprocessor?.castToAny(), - ) + return kType.withNullability(tableColumnMetadata.isNullable) } /** @@ -205,13 +175,13 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { * * @param [rs] the ResultSet to read from * @param [columnIndex] zero-based column index - * @param [typeInformation] * @return the extracted value, or null */ public open fun getValueFromResultSet( rs: ResultSet, columnIndex: 
Int, - typeInformation: TypeInformation, + tableColumnMetadata: TableColumnMetadata, + expectedJdbcType: KType, ): J? = try { rs.getObject(columnIndex + 1) @@ -220,17 +190,80 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { rs.getString(columnIndex + 1) } as J? - public fun preprocessValuesFromResultSet( + // TODO add preprocessors for common types, like sql Arrays, Java datetimes, etc. + public open fun getPreprocessedValueType( + tableColumnMetadata: TableColumnMetadata, + expectedJdbcType: KType, + ): KType = expectedJdbcType + + // TODO add preprocessors for common types, like sql Arrays, Java datetimes, etc. + public open fun preprocessValue( value: J?, - typeInformation: TypeInformation, - ): D? = typeInformation.preprocess(value) + tableColumnMetadata: TableColumnMetadata, + expectedJdbcType: KType, + expectedPreprocessedValueType: KType, + ): D? = value as D? + + public open fun getTargetColumnSchema( + tableColumnMetadata: TableColumnMetadata, + expectedValueType: KType, + ): ColumnSchema = + when (tableColumnMetadata.jdbcType) { + // buildDataColumn post-processes java.sql.Array -> Kotlin arrays, making the result type `Array<*>` + Types.ARRAY -> ColumnSchema.Value(typeOf>().withNullability(expectedValueType.isMarkedNullable)) + + else -> ColumnSchema.Value(expectedValueType) + } public open fun buildDataColumn( name: String, values: List, - typeInformation: TypeInformation<*, D, P>, + tableColumnMetadata: TableColumnMetadata, + targetColumnSchema: ColumnSchema, inferNullability: Boolean, - ): DataColumn = typeInformation.buildDataColumn(name, values, inferNullability) + ): DataColumn { + val postProcessedValues = when (tableColumnMetadata.jdbcType) { + // Special case which post-processes java.sql.Array -> Kotlin arrays + Types.ARRAY -> handleArrayValues(values) + + else -> values + } + return postProcessedValues.toDataColumn( + name = name, + targetColumnSchema = targetColumnSchema, + inferNullability = inferNullability, + ) + } + + 
protected fun List.toDataColumn( + name: String, + targetColumnSchema: ColumnSchema, + inferNullability: Boolean, + ): DataColumn = + when (targetColumnSchema) { + is ColumnSchema.Value -> + DataColumn.createValueColumn( + name = name, + values = this, + infer = if (inferNullability) Infer.Nulls else Infer.None, + type = targetColumnSchema.type, + ).cast() + + // NOTE: this case should be avoided. + // Creating `n` DataRows is heavy! + is ColumnSchema.Group -> + DataColumn.createColumnGroup( + name = name, + df = (this as List).toDataFrame(), + ).asDataColumn().cast() + + is ColumnSchema.Frame -> + DataColumn.createFrameColumn( + name = name, + groups = this as List, + schema = lazy { targetColumnSchema.schema }, + ).cast() + } /** * Checks if the given table name is a system table for the specified database type. @@ -499,7 +532,7 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { * @param values raw values containing SQL Array objects * @return list of consistently typed arrays, or original arrays if no common type exists */ - private fun handleArrayValues(values: List): List { + private fun handleArrayValues(values: List): List?> { // Intermediate variable for the first mapping val sqlArrays = values.mapNotNull { (it as? java.sql.Array)?.array?.let { array -> array as? Array<*> } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt index 8e7e21c65b..3e164e03db 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -85,15 +85,15 @@ private val logger = KotlinLogging.logger {} * This class provides methods to convert data from a [ResultSet] to the appropriate type for DuckDB, * and to generate the corresponding [column schema][ColumnSchema]. 
*/ -public object DuckDb : DbType("duckdb") { +public object DuckDb : AdvancedDbType("duckdb") { /** the name of the class of the DuckDB JDBC driver */ override val driverClassName: String = "org.duckdb.DuckDBDriver" - override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyTypeInformation = + override fun generateTypeMapping(tableColumnMetadata: TableColumnMetadata): AnyJdbcTypeMapping = parseDuckDbType(tableColumnMetadata.sqlTypeName, tableColumnMetadata.isNullable) - private val duckDbTypeCache = mutableMapOf, AnyTypeInformation>() + private val duckDbTypeCache = mutableMapOf, AnyJdbcTypeMapping>() /** * How a column type from JDBC, [sqlTypeName], is read in Java/Kotlin. @@ -103,7 +103,7 @@ public object DuckDb : DbType("duckdb") { * Following [org.duckdb.DuckDBVector.getObject] and converting the result to * */ - internal fun parseDuckDbType(sqlTypeName: String, isNullable: Boolean): AnyTypeInformation = + internal fun parseDuckDbType(sqlTypeName: String, isNullable: Boolean): AnyJdbcTypeMapping = duckDbTypeCache.getOrPut(Pair(sqlTypeName, isNullable)) { when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) { BOOLEAN -> typeInformationForValueColumnOf(isNullable) @@ -134,21 +134,22 @@ public object DuckDb : DbType("duckdb") { DECIMAL -> typeInformationForValueColumnOf(isNullable) - TIME -> typeInformationWithPreprocessingForValueColumnOf( - isNullable = isNullable, - ) { it, _ -> it?.toKotlinLocalTime() } + TIME -> + typeInformationWithPreprocessingForValueColumnOf(isNullable) { + it?.toKotlinLocalTime() + } // todo? 
TIME_WITH_TIME_ZONE -> typeInformationForValueColumnOf(isNullable) - DATE -> typeInformationWithPreprocessingForValueColumnOf( - isNullable = isNullable, - ) { it, _ -> it?.toKotlinLocalDate() } + DATE -> typeInformationWithPreprocessingForValueColumnOf(isNullable) { + it?.toKotlinLocalDate() + } TIMESTAMP, TIMESTAMP_MS, TIMESTAMP_NS, TIMESTAMP_S -> - typeInformationWithPreprocessingForValueColumnOf( - isNullable = isNullable, - ) { it, _ -> it?.toInstant()?.toKotlinInstant() } + typeInformationWithPreprocessingForValueColumnOf(isNullable) { + it?.toInstant()?.toKotlinInstant() + } // todo? TIMESTAMP_WITH_TIME_ZONE -> typeInformationForValueColumnOf(isNullable) @@ -158,9 +159,9 @@ public object DuckDb : DbType("duckdb") { BLOB -> typeInformationForValueColumnOf(isNullable) - UUID -> typeInformationWithPreprocessingForValueColumnOf( - isNullable = isNullable, - ) { it, _ -> it?.toKotlinUuid() } + UUID -> typeInformationWithPreprocessingForValueColumnOf(isNullable) { + it?.toKotlinUuid() + } MAP -> { val (key, value) = parseMapTypes(sqlTypeName) @@ -176,12 +177,12 @@ public object DuckDb : DbType("duckdb") { ).withNullability(isNullable) typeInformationWithPreprocessingForValueColumnOf, Map>( - jdbcSourceType = typeOf>().withNullability(isNullable), // unused - targetColumnType = targetMapType, - ) { map, _ -> + isNullable = isNullable, + preprocessedValueType = targetMapType, + ) { map -> // only need to preprocess the values, as the keys are just Strings map?.mapValues { (_, value) -> - parsedValueType.preprocess(value) + parsedValueType.preprocessOrCast(value) } } } @@ -200,13 +201,13 @@ public object DuckDb : DbType("duckdb") { ).withNullability(isNullable) // todo maybe List should become FrameColumn - typeInformationWithPreprocessingFor>( - jdbcSourceType = typeOf().withNullability(isNullable), - targetSchema = ColumnSchema.Value(targetListType), - ) { sqlArray, _ -> + typeInformationWithPreprocessingForValueColumnOf>( + isNullable = isNullable, + 
preprocessedValueType = targetListType, + ) { sqlArray -> sqlArray ?.toList() - ?.map(parsedListType::preprocess) // recursively preprocess + ?.map { parsedListType.preprocessOrCast(it) } // recursively preprocess } } @@ -222,9 +223,9 @@ public object DuckDb : DbType("duckdb") { ) typeInformationWithProcessingFor, DataRow<*>>( - jdbcSourceType = typeOf().withNullability(isNullable), + isNullable = isNullable, targetSchema = targetSchema, - valuePreprocessor = { struct, _ -> + valuePreprocessor = { struct -> // NOTE DataRows cannot be `null` in DataFrame, instead, all its fields become `null` if (struct == null) { parsedStructEntries.mapValues { null } @@ -232,17 +233,17 @@ public object DuckDb : DbType("duckdb") { // read data from the struct val attrs = struct.getAttributes( parsedStructEntries.mapValues { - (it.value.jdbcSourceType.classifier!! as KClass<*>).java + (it.value.expectedJdbcType.classifier!! as KClass<*>).java }, ) // and potentially, preprocess each value individually parsedStructEntries.entries.withIndex().associate { (i, entry) -> - entry.key to entry.value.castToAny().preprocess(attrs[i]) + entry.key to entry.value.castToAny().preprocessOrCast(attrs[i]) } } }, - columnBuilder = { name, values, _, _ -> + columnBuilder = { name, values, _ -> (values as List>) .toDataFrame() .asColumnGroup(name) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/H2.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/H2.kt index d5c0a0e5b2..9eccf9bb1f 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/H2.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/H2.kt @@ -2,6 +2,7 @@ package org.jetbrains.kotlinx.dataframe.io.db import java.sql.ResultSet import java.util.Locale +import kotlin.reflect.KType import org.jetbrains.kotlinx.dataframe.io.db.MariaDb as MariaDbType import org.jetbrains.kotlinx.dataframe.io.db.MsSql as MsSqlType import 
org.jetbrains.kotlinx.dataframe.io.db.MySql as MySqlType @@ -117,9 +118,9 @@ public open class H2(public val mode: Mode = Mode.Regular) : DbType("h2") { override val driverClassName: String get() = "org.h2.Driver" - override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyTypeInformation = - delegate?.generateTypeInformation(tableColumnMetadata) - ?: super.generateTypeInformation(tableColumnMetadata) + override fun getExpectedJdbcType(tableColumnMetadata: TableColumnMetadata): KType = + delegate?.getExpectedJdbcType(tableColumnMetadata) + ?: super.getExpectedJdbcType(tableColumnMetadata) override fun isSystemTable(tableMetadata: TableMetadata): Boolean { val locale = Locale.getDefault() diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/JdbcTypeMapping.kt similarity index 65% rename from dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt rename to dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/JdbcTypeMapping.kt index 6791e7666f..e16e47a4d3 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/TypeInformation.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/JdbcTypeMapping.kt @@ -1,18 +1,12 @@ package org.jetbrains.kotlinx.dataframe.io.db -import org.jetbrains.kotlinx.dataframe.AnyFrame -import org.jetbrains.kotlinx.dataframe.AnyRow import org.jetbrains.kotlinx.dataframe.DataColumn -import org.jetbrains.kotlinx.dataframe.api.Infer -import org.jetbrains.kotlinx.dataframe.api.asDataColumn -import org.jetbrains.kotlinx.dataframe.api.cast -import org.jetbrains.kotlinx.dataframe.api.toDataFrame import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import kotlin.reflect.KType import kotlin.reflect.full.withNullability import kotlin.reflect.typeOf -public typealias AnyTypeInformation = TypeInformation<*, *, 
*> +public typealias AnyJdbcTypeMapping = JdbcTypeMapping<*, *, *> /** * Represents all type information that can be retrieved from an SQL column. @@ -32,73 +26,74 @@ public typealias AnyTypeInformation = TypeInformation<*, *, *> * @property columnBuilder an optional function that converts a [List] with values of type [D] * to a [DataColumn] of with values of type [P]. */ -public open class TypeInformation( - public open val jdbcSourceType: KType, +public open class JdbcTypeMapping( + public open val expectedJdbcType: KType, + public open val preprocessedValueType: KType, public open val targetSchema: ColumnSchema, public open val valuePreprocessor: DbValuePreprocessor?, public open val columnBuilder: DbColumnBuilder?, ) { - public open fun preprocess(value: J?): D? { + public open fun preprocessOrCast(value: J?): D? { valuePreprocessor?.let { valuePreprocessor -> - return valuePreprocessor.preprocess(value, this) + return valuePreprocessor.preprocess(value) } return value as D? } - public open fun buildDataColumn(name: String, values: List, inferNullability: Boolean): DataColumn { + public open fun buildDataColumnOrNull(name: String, values: List, inferNullability: Boolean): DataColumn? { columnBuilder?.let { columnPostprocessor -> - return columnPostprocessor.buildDataColumn(name, values, this, inferNullability) - } - return when (val schema = targetSchema) { - is ColumnSchema.Value -> - DataColumn.createValueColumn( - name = name, - values = values, - infer = if (inferNullability) Infer.Nulls else Infer.None, - type = schema.type, - ).cast() - - // TODO, this case should be avoided. - // Creating `n` DataRows is heavy! 
- is ColumnSchema.Group -> - DataColumn.createColumnGroup( - name = name, - df = (values as List).toDataFrame(), - ).asDataColumn().cast() - - is ColumnSchema.Frame -> - DataColumn.createFrameColumn( - name = name, - groups = values as List, - schema = lazy { schema.schema }, - ).cast() + return columnPostprocessor.buildDataColumn(name, values, inferNullability) } + return null } } -public fun TypeInformation<*, *, *>.cast(): TypeInformation = - this as TypeInformation +public fun JdbcTypeMapping<*, *, *>.cast(): JdbcTypeMapping = + this as JdbcTypeMapping -public fun TypeInformation<*, *, *>.castToAny(): TypeInformation = cast() +public fun JdbcTypeMapping<*, *, *>.castToAny(): JdbcTypeMapping = cast() // region generic constructors public fun typeInformationWithProcessingFor( jdbcSourceType: KType, + preprocessedValueType: KType, // = jdbcSourceType + targetSchema: ColumnSchema, // = ColumnSchema.Value(preprocessedValueType) + valuePreprocessor: DbValuePreprocessor?, + columnBuilder: DbColumnBuilder?, +): JdbcTypeMapping = + JdbcTypeMapping( + expectedJdbcType = jdbcSourceType, + preprocessedValueType = preprocessedValueType, + targetSchema = targetSchema, + valuePreprocessor = valuePreprocessor, + columnBuilder = columnBuilder, + ) + +public inline fun typeInformationWithProcessingFor( + isNullable: Boolean, + jdbcSourceType: KType = typeOf().withNullability(isNullable), + preprocessedValueType: KType = typeOf().withNullability(isNullable), targetSchema: ColumnSchema, valuePreprocessor: DbValuePreprocessor?, columnBuilder: DbColumnBuilder?, -): TypeInformation = - TypeInformation( +): JdbcTypeMapping = + typeInformationWithProcessingFor( jdbcSourceType = jdbcSourceType, + preprocessedValueType = preprocessedValueType, targetSchema = targetSchema, valuePreprocessor = valuePreprocessor, columnBuilder = columnBuilder, ) -public fun typeInformationFor(jdbcSourceType: KType, targetSchema: ColumnSchema): TypeInformation = +public fun typeInformationFor( + 
jdbcSourceType: KType, + preprocessedValueType: KType, + targetSchema: ColumnSchema, +): JdbcTypeMapping = typeInformationWithProcessingFor( jdbcSourceType = jdbcSourceType, + preprocessedValueType = preprocessedValueType, targetSchema = targetSchema, valuePreprocessor = null, columnBuilder = null, @@ -106,11 +101,13 @@ public fun typeInformationFor(jdbcSourceType: KType, targetSchema: Col public fun typeInformationWithPreprocessingFor( jdbcSourceType: KType, + preprocessedValueType: KType, targetSchema: ColumnSchema, valuePreprocessor: DbValuePreprocessor?, -): TypeInformation = +): JdbcTypeMapping = typeInformationWithProcessingFor( jdbcSourceType = jdbcSourceType, + preprocessedValueType = preprocessedValueType, targetSchema = targetSchema, valuePreprocessor = valuePreprocessor, columnBuilder = null, @@ -120,9 +117,10 @@ public fun typeInformationWithPostprocessingFor( jdbcSourceType: KType, targetSchema: ColumnSchema, columnBuilder: DbColumnBuilder?, -): TypeInformation = +): JdbcTypeMapping = typeInformationWithProcessingFor( jdbcSourceType = jdbcSourceType, + preprocessedValueType = jdbcSourceType, targetSchema = targetSchema, valuePreprocessor = null, columnBuilder = columnBuilder, @@ -132,30 +130,38 @@ public fun typeInformationWithPostprocessingFor( // region ValueColumn constructors -public fun typeInformationForValueColumnOf(kType: KType): TypeInformation = - typeInformationFor(jdbcSourceType = kType, targetSchema = ColumnSchema.Value(kType)) +public fun typeInformationForValueColumnOf(kType: KType): JdbcTypeMapping = + typeInformationFor(jdbcSourceType = kType, preprocessedValueType = kType, targetSchema = ColumnSchema.Value(kType)) -public inline fun typeInformationForValueColumnOf(isNullable: Boolean): TypeInformation = - typeInformationForValueColumnOf(typeOf().withNullability(isNullable)) +public inline fun typeInformationForValueColumnOf( + isNullable: Boolean, + targetColumnType: KType = typeOf().withNullability(isNullable), +): JdbcTypeMapping = 
typeInformationForValueColumnOf(targetColumnType) public fun typeInformationWithPreprocessingForValueColumnOf( jdbcSourceType: KType, + preprocessedValueType: KType, targetColumnType: KType, valuePreprocessor: DbValuePreprocessor?, -): TypeInformation = +): JdbcTypeMapping = typeInformationWithPreprocessingFor( jdbcSourceType = jdbcSourceType, + preprocessedValueType = preprocessedValueType, targetSchema = ColumnSchema.Value(targetColumnType), valuePreprocessor = valuePreprocessor, ) public inline fun typeInformationWithPreprocessingForValueColumnOf( isNullable: Boolean, + jdbcSourceType: KType = typeOf().withNullability(isNullable), + preprocessedValueType: KType = typeOf().withNullability(isNullable), + targetColumnType: KType = preprocessedValueType, valuePreprocessor: DbValuePreprocessor?, -): TypeInformation = +): JdbcTypeMapping = typeInformationWithPreprocessingForValueColumnOf( - jdbcSourceType = typeOf().withNullability(isNullable), - targetColumnType = typeOf().withNullability(isNullable), + jdbcSourceType = jdbcSourceType, + preprocessedValueType = preprocessedValueType, + targetColumnType = targetColumnType, valuePreprocessor = valuePreprocessor, ) @@ -163,7 +169,7 @@ public fun typeInformationWithPostprocessingForValueColumnOf( jdbcSourceType: KType, targetColumnType: KType, columnPostprocessor: DbColumnBuilder?, -): TypeInformation = +): JdbcTypeMapping = typeInformationWithPostprocessingFor( jdbcSourceType = jdbcSourceType, targetSchema = ColumnSchema.Value(targetColumnType), @@ -172,11 +178,13 @@ public fun typeInformationWithPostprocessingForValueColumnOf( public inline fun typeInformationWithPostprocessingForValueColumnOf( isNullable: Boolean, + jdbcSourceType: KType = typeOf().withNullability(isNullable), + targetColumnType: KType = typeOf

().withNullability(isNullable), columnPostprocessor: DbColumnBuilder?, -): TypeInformation = +): JdbcTypeMapping = typeInformationWithPostprocessingForValueColumnOf( - jdbcSourceType = typeOf().withNullability(isNullable), - targetColumnType = typeOf

().withNullability(isNullable), + jdbcSourceType = jdbcSourceType, + targetColumnType = targetColumnType, columnPostprocessor = columnPostprocessor, ) @@ -200,7 +208,7 @@ public fun interface DbValuePreprocessor { * If you intend to create a [org.jetbrains.kotlinx.dataframe.columns.FrameColumn], * return a [org.jetbrains.kotlinx.dataframe.DataFrame] here. */ - public fun preprocess(jdbcValue: J?, typeInformation: TypeInformation<@UnsafeVariance J, @UnsafeVariance D, *>): D? + public fun preprocess(jdbcValue: J?): D? } public fun DbValuePreprocessor<*, *>.cast(): DbValuePreprocessor = @@ -217,12 +225,7 @@ public fun interface DbColumnBuilder { /** * Converts the given [values]: [DataColumn] with values of type [D] to a [DataColumn] of with values of type [P]. */ - public fun buildDataColumn( - name: String, - values: List, - typeInformation: TypeInformation<*, @UnsafeVariance D, @UnsafeVariance P>, - inferNullability: Boolean, - ): DataColumn + public fun buildDataColumn(name: String, values: List, inferNullability: Boolean): DataColumn } public fun DbColumnBuilder<*, *>.cast(): DbColumnBuilder = this as DbColumnBuilder diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt index 191dadb57a..9060004e62 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MariaDb.kt @@ -1,7 +1,7 @@ package org.jetbrains.kotlinx.dataframe.io.db -import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import java.sql.ResultSet +import kotlin.reflect.KType import kotlin.reflect.full.withNullability import kotlin.reflect.typeOf @@ -15,7 +15,7 @@ public object MariaDb : DbType("mariadb") { override val driverClassName: String get() = "org.mariadb.jdbc.Driver" - override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): 
AnyTypeInformation { + override fun getExpectedJdbcType(tableColumnMetadata: TableColumnMetadata): KType { // Force BIGINT to always be Long, regardless of javaClassName // MariaDB JDBC driver may report Integer for small BIGINT values // TODO: investigate the corner case @@ -28,13 +28,13 @@ public object MariaDb : DbType("mariadb") { if (tableColumnMetadata.sqlTypeName == "INTEGER UNSIGNED" || tableColumnMetadata.sqlTypeName == "INT UNSIGNED" ) { - return typeInformationForValueColumnOf(tableColumnMetadata.isNullable) + return typeOf().withNullability(tableColumnMetadata.isNullable) } if (tableColumnMetadata.sqlTypeName == "SMALLINT" && tableColumnMetadata.javaClassName == "java.lang.Short") { - return typeInformationForValueColumnOf(tableColumnMetadata.isNullable) + return typeOf().withNullability(tableColumnMetadata.isNullable) } - return super.generateTypeInformation(tableColumnMetadata) + return super.getExpectedJdbcType(tableColumnMetadata) } override fun isSystemTable(tableMetadata: TableMetadata): Boolean = MySql.isSystemTable(tableMetadata) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt index c5cab02de1..7c4f74486c 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/MySql.kt @@ -1,8 +1,8 @@ package org.jetbrains.kotlinx.dataframe.io.db -import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import java.sql.ResultSet import java.util.Locale +import kotlin.reflect.KType import kotlin.reflect.full.withNullability import kotlin.reflect.typeOf @@ -16,11 +16,11 @@ public object MySql : DbType("mysql") { override val driverClassName: String get() = "com.mysql.jdbc.Driver" - override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyTypeInformation { + override fun 
getExpectedJdbcType(tableColumnMetadata: TableColumnMetadata): KType { if (tableColumnMetadata.sqlTypeName == "INT UNSIGNED") { - return typeInformationForValueColumnOf(tableColumnMetadata.isNullable) + return typeOf().withNullability(tableColumnMetadata.isNullable) } - return super.generateTypeInformation(tableColumnMetadata) + return super.getExpectedJdbcType(tableColumnMetadata) } override fun isSystemTable(tableMetadata: TableMetadata): Boolean { diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt index edff12478b..1182860067 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt @@ -1,8 +1,8 @@ package org.jetbrains.kotlinx.dataframe.io.db -import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import java.sql.ResultSet import java.util.Locale +import kotlin.reflect.KType import kotlin.reflect.full.withNullability import kotlin.reflect.typeOf @@ -16,12 +16,12 @@ public object PostgreSql : DbType("postgresql") { override val driverClassName: String get() = "org.postgresql.Driver" - override fun generateTypeInformation(tableColumnMetadata: TableColumnMetadata): AnyTypeInformation { + override fun getExpectedJdbcType(tableColumnMetadata: TableColumnMetadata): KType { // because of https://github.com/pgjdbc/pgjdbc/issues/425 if (tableColumnMetadata.sqlTypeName == "money") { - return typeInformationForValueColumnOf(tableColumnMetadata.isNullable) + return typeOf().withNullability(tableColumnMetadata.isNullable) } - return super.generateTypeInformation(tableColumnMetadata) + return super.getExpectedJdbcType(tableColumnMetadata) } override fun isSystemTable(tableMetadata: TableMetadata): Boolean = diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readDataFrameSchema.kt 
b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readDataFrameSchema.kt index 8944631656..8b07bf364c 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readDataFrameSchema.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readDataFrameSchema.kt @@ -4,15 +4,12 @@ import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.impl.schema.DataFrameSchemaImpl import org.jetbrains.kotlinx.dataframe.io.db.DbType -import org.jetbrains.kotlinx.dataframe.io.db.TableColumnMetadata import org.jetbrains.kotlinx.dataframe.io.db.extractDBTypeFromConnection -import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import java.sql.Connection import java.sql.DriverManager import java.sql.ResultSet import javax.sql.DataSource -import kotlin.use /** * Retrieves the schema for an SQL table using the provided database configuration. 
@@ -98,6 +95,8 @@ public fun DataFrameSchema.Companion.readSqlTable( ): DataFrameSchema { val determinedDbType = dbType ?: extractDBTypeFromConnection(connection) + // TODO don't need to read 1 row, take it just from TableColumnMetadatas + // Read just 1 row to get the schema val singleRowDataFrame = DataFrame.readSqlTable( connection = connection, @@ -322,7 +321,10 @@ public fun Connection.readDataFrameSchema(sqlQueryOrTableName: String, dbType: D */ public fun DataFrameSchema.Companion.readResultSet(resultSet: ResultSet, dbType: DbType): DataFrameSchema { val tableColumns = getTableColumnsMetadata(resultSet, dbType) - return buildSchemaByTableColumns(tableColumns, dbType) + val expectedJdbcTypes = getExpectedJdbcTypes(dbType, tableColumns) + val preprocessedValueTypes = getPreprocessedValueTypes(dbType, tableColumns, expectedJdbcTypes) + val targetColumnSchemas = getTargetColumnSchemas(dbType, tableColumns, preprocessedValueTypes) + return DataFrameSchemaImpl(targetColumnSchemas) } /** @@ -437,21 +439,3 @@ public fun DataFrameSchema.Companion.readAllSqlTables( return dataFrameSchemas } - -/** - * Builds a DataFrame schema based on the given table columns. - * - * @param [tableColumns] a mutable map containing the table columns, where the key represents the column name - * and the value represents the metadata of the column - * @param [dbType] the type of database. - * @return a [DataFrameSchema] object representing the schema built from the table columns. 
- */ -internal fun buildSchemaByTableColumns( - tableColumns: MutableList, - dbType: DbType, -): DataFrameSchema { - val schemaColumns = tableColumns.associate { - it.name to dbType.getOrGenerateTypeInformation(it).targetSchema - } - return DataFrameSchemaImpl(columns = schemaColumns) -} diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt index 35aeb33c48..ddbabb7e6d 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readJdbc.kt @@ -9,11 +9,8 @@ import org.jetbrains.kotlinx.dataframe.api.isFrameColumn import org.jetbrains.kotlinx.dataframe.api.isValueColumn import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.api.toDataFrame -import org.jetbrains.kotlinx.dataframe.io.db.AnyTypeInformation import org.jetbrains.kotlinx.dataframe.io.db.DbType import org.jetbrains.kotlinx.dataframe.io.db.TableColumnMetadata -import org.jetbrains.kotlinx.dataframe.io.db.cast -import org.jetbrains.kotlinx.dataframe.io.db.castToAny import org.jetbrains.kotlinx.dataframe.io.db.extractDBTypeFromConnection import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import java.sql.Connection @@ -23,6 +20,7 @@ import java.sql.PreparedStatement import java.sql.ResultSet import javax.sql.DataSource import kotlin.reflect.KClass +import kotlin.reflect.KType import kotlin.reflect.full.isSubclassOf private val logger = KotlinLogging.logger {} @@ -191,7 +189,7 @@ private fun executeQueryAndBuildDataFrame( logger.debug { "Executing query: $sqlQuery" } statement.executeQuery().use { rs -> val tableColumns = getTableColumnsMetadata(rs, determinedDbType) - fetchAndConvertDataFromResultSet(tableColumns, rs, determinedDbType, limit, inferNullability) + fetchAndConvertDataFromResultSet(determinedDbType, tableColumns, rs, limit, inferNullability) 
} } } catch (e: java.sql.SQLException) { @@ -572,7 +570,7 @@ public fun DataFrame.Companion.readResultSet( ): AnyFrame { validateLimit(limit) val tableColumns = getTableColumnsMetadata(resultSet, dbType) - return fetchAndConvertDataFromResultSet(tableColumns, resultSet, dbType, limit, inferNullability) + return fetchAndConvertDataFromResultSet(dbType, tableColumns, resultSet, limit, inferNullability) } /** @@ -861,8 +859,8 @@ private fun readTableAsDataFrame( return dataFrame } -internal fun getTableColumnsMetadata(resultSet: ResultSet, dbType: DbType): MutableList = - dbType.getTableColumnsMetadata(resultSet).toMutableList() +internal fun getTableColumnsMetadata(resultSet: ResultSet, dbType: DbType): List = + dbType.getTableColumnsMetadata(resultSet) /** * Fetches and converts data from a ResultSet into a mutable map. @@ -876,25 +874,41 @@ internal fun getTableColumnsMetadata(resultSet: ResultSet, dbType: DbType): Muta * @return A mutable map containing the fetched and converted data. */ internal fun fetchAndConvertDataFromResultSet( - tableColumns: MutableList, - rs: ResultSet, dbType: DbType, + tableColumns: List, + rs: ResultSet, limit: Int?, inferNullability: Boolean, ): AnyFrame { - val columnTypeInformation = buildColumnTypeInformation(tableColumns = tableColumns, dbType = dbType) - val columnData = readAllRowsFromResultSet( + val expectedJdbcTypes = getExpectedJdbcTypes( + dbType = dbType, + tableColumns = tableColumns, + ) + val preprocessedValueTypes = getPreprocessedValueTypes( + dbType = dbType, + tableColumns = tableColumns, + expectedJdbcTypes = expectedJdbcTypes, + ) + val targetColumnSchemas = getTargetColumnSchemas( + dbType = dbType, + tableColumns = tableColumns, + preprocessedValueTypes = preprocessedValueTypes, + ) + + val columnData = readAndPreprocessRowsFromResultSet( rs = rs, tableColumns = tableColumns, - columnTypeInformation = columnTypeInformation, + expectedJdbcTypes = expectedJdbcTypes, + preprocessedValueTypes = 
preprocessedValueTypes, dbType = dbType, limit = limit, ) + val dataFrame = buildDataFrameFromColumnData( + dbType = dbType, columnData = columnData, tableColumns = tableColumns, - columnTypeInformation = columnTypeInformation, - dbType = dbType, + targetColumnSchemas = targetColumnSchemas, inferNullability = inferNullability, ) @@ -905,41 +919,69 @@ internal fun fetchAndConvertDataFromResultSet( return dataFrame } -/** - * Builds a map of column indices to their Kotlin types. - */ -private fun buildColumnTypeInformation( +internal fun getExpectedJdbcTypes(dbType: DbType, tableColumns: List): Map = + tableColumns.associate { + it.name to dbType.getExpectedJdbcType(tableColumnMetadata = it) + } + +internal fun getPreprocessedValueTypes( + dbType: DbType, tableColumns: List, + expectedJdbcTypes: Map, +): Map = + tableColumns.associate { + it.name to dbType.getPreprocessedValueType( + tableColumnMetadata = it, + expectedJdbcType = expectedJdbcTypes[it.name]!!, + ) + } + +internal fun getTargetColumnSchemas( dbType: DbType, -): List = - tableColumns.indices.map { index -> - dbType.getOrGenerateTypeInformation(tableColumns[index]) + tableColumns: List, + preprocessedValueTypes: Map, +): Map = + tableColumns.associate { + it.name to dbType.getTargetColumnSchema( + tableColumnMetadata = it, + expectedValueType = preprocessedValueTypes[it.name]!!, + ) } /** * Reads all rows from ResultSet and returns a column-oriented data structure. 
*/ -private fun readAllRowsFromResultSet( +private fun readAndPreprocessRowsFromResultSet( + dbType: DbType, rs: ResultSet, tableColumns: List, - columnTypeInformation: List, - dbType: DbType, + expectedJdbcTypes: Map, + preprocessedValueTypes: Map, limit: Int?, -): List> { - val columnsCount = tableColumns.size - val columnData = List(columnsCount) { mutableListOf() } +): Map> { + val columnNames = tableColumns.map { it.name } + val columnData = columnNames.associateWith { mutableListOf() } var rowsRead = 0 while (rs.next() && (limit == null || rowsRead < limit)) { - repeat(columnsCount) { columnIndex -> - val typeInformation = columnTypeInformation[columnIndex].castToAny() - val value = dbType.getValueFromResultSet( + columnNames.forEachIndexed { i, name -> + val tableColumnMetadata = tableColumns[i] + val expectedJdbcType = expectedJdbcTypes[name]!! + val preprocessedValueType = preprocessedValueTypes[name]!! + + val value = dbType.getValueFromResultSet( rs = rs, - columnIndex = columnIndex, - typeInformation = typeInformation, + columnIndex = i, + tableColumnMetadata = tableColumnMetadata, + expectedJdbcType = expectedJdbcType, + ) + val preprocessedValue = dbType.preprocessValue( + value = value, + tableColumnMetadata = tableColumnMetadata, + expectedJdbcType = expectedJdbcType, + expectedPreprocessedValueType = preprocessedValueType, ) - val preprocessedValue = dbType.preprocessValuesFromResultSet(value, typeInformation) - columnData[columnIndex].add(preprocessedValue) + columnData[name]!!.add(preprocessedValue) } rowsRead++ // if (rowsRead % 1000 == 0) logger.debug { "Loaded $rowsRead rows." } // TODO: https://github.com/Kotlin/dataframe/issues/455 @@ -953,24 +995,25 @@ private fun readAllRowsFromResultSet( * Accepts mutable lists to enable efficient in-place transformations. 
*/ private fun buildDataFrameFromColumnData( - columnData: List>, - tableColumns: List, - columnTypeInformation: List, dbType: DbType, + columnData: Map>, + tableColumns: List, + targetColumnSchemas: Map, inferNullability: Boolean, checkSchema: Boolean = true, // TODO add as configurable parameter ): AnyFrame = - columnData.mapIndexed { index, values -> - val typeInformation = columnTypeInformation[index].castToAny() - val column = dbType.buildDataColumn( - name = tableColumns[index].name, - values = values, - typeInformation = typeInformation, + tableColumns.map { + val name = it.name + val column = dbType.buildDataColumn( + name = name, + values = columnData[name]!!, + tableColumnMetadata = it, + targetColumnSchema = targetColumnSchemas[name]!!, inferNullability = inferNullability, ) if (checkSchema) { - column.checkSchema(typeInformation.targetSchema) + column.checkSchema(targetColumnSchemas[name]!!) } column diff --git a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt index 085d519220..89bf3e9bd1 100644 --- a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt +++ b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt @@ -260,8 +260,10 @@ class DuckDbTest { @DataSchema data class NestedTypes( + @ColumnName("test_col") + val testCol: Int, @ColumnName("ijstruct_col") - val ijstructCol: NestedEntry, // TODO + val ijstructCol: NestedEntry, @ColumnName("intarray_col") val intarrayCol: List, @ColumnName("intlist_col") @@ -600,6 +602,7 @@ class DuckDbTest { connection.prepareStatement( """ CREATE TABLE IF NOT EXISTS table2 ( + test_col INTEGER, intarray_col INTEGER[3], stringarray_col VARCHAR[3], intlist_col INTEGER[], @@ -616,6 +619,7 @@ class DuckDbTest { connection.prepareStatement( """ INSERT INTO table2 VALUES ( + 1, -- int array_value(1, 2, NULL), -- int array 
array_value('a', 'ab', 'abc'), -- string array list_value(1, 2, 3), -- int list From d6780327a308ac61bffacd5c725cd8985d16450a Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 17 Dec 2025 11:38:56 +0100 Subject: [PATCH 12/14] added resultSetReader option to JdbcToDataFrameConverter, converting some types by default --- .../kotlinx/dataframe/io/db/AdvancedDbType.kt | 39 ++++-- .../kotlinx/dataframe/io/db/DbType.kt | 93 +++++++++----- .../kotlinx/dataframe/io/db/DuckDb.kt | 62 ++++----- ...Mapping.kt => JdbcToDataFrameConverter.kt} | 119 ++++++++++++------ .../kotlinx/dataframe/io/h2/mariadbH2Test.kt | 14 ++- .../kotlinx/dataframe/io/h2/mssqlH2Test.kt | 9 +- .../kotlinx/dataframe/io/h2/mysqlH2Test.kt | 14 ++- 7 files changed, 224 insertions(+), 126 deletions(-) rename dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/{JdbcTypeMapping.kt => JdbcToDataFrameConverter.kt} (64%) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/AdvancedDbType.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/AdvancedDbType.kt index acba290819..31c824b725 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/AdvancedDbType.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/AdvancedDbType.kt @@ -2,45 +2,62 @@ package org.jetbrains.kotlinx.dataframe.io.db import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema +import java.sql.ResultSet import kotlin.reflect.KType /** * Alternative version of [DbType] that allows to customize type mapping - * by initializing a [JdbcTypeMapping] instance for each JDBC type. + * by initializing a [JdbcToDataFrameConverter] instance for each JDBC type. * * This can be helpful for JDBC databases that support structured data, like [DuckDb] * or that need to a lot of type mapping. 
*/ public abstract class AdvancedDbType(dbTypeInJdbcUrl: String) : DbType(dbTypeInJdbcUrl) { - protected abstract fun generateTypeMapping(tableColumnMetadata: TableColumnMetadata): AnyJdbcTypeMapping + protected abstract fun generateConverter(tableColumnMetadata: TableColumnMetadata): AnyJdbcToDataFrameConverter - private val typeMappingCache = mutableMapOf() + private val converterCache = mutableMapOf() - protected fun getTypeMapping(tableColumnMetadata: TableColumnMetadata): AnyJdbcTypeMapping = - typeMappingCache.getOrPut(tableColumnMetadata) { - generateTypeMapping(tableColumnMetadata) + protected fun getConverter(tableColumnMetadata: TableColumnMetadata): AnyJdbcToDataFrameConverter = + converterCache.getOrPut(tableColumnMetadata) { + generateConverter(tableColumnMetadata) } final override fun getExpectedJdbcType(tableColumnMetadata: TableColumnMetadata): KType = - getTypeMapping(tableColumnMetadata).expectedJdbcType + getConverter(tableColumnMetadata).expectedJdbcType final override fun getPreprocessedValueType( tableColumnMetadata: TableColumnMetadata, expectedJdbcType: KType, - ): KType = getTypeMapping(tableColumnMetadata).preprocessedValueType + ): KType = getConverter(tableColumnMetadata).preprocessedValueType final override fun getTargetColumnSchema( tableColumnMetadata: TableColumnMetadata, expectedValueType: KType, - ): ColumnSchema = getTypeMapping(tableColumnMetadata).targetSchema + ): ColumnSchema = getConverter(tableColumnMetadata).targetSchema + + final override fun getValueFromResultSet( + rs: ResultSet, + columnIndex: Int, + tableColumnMetadata: TableColumnMetadata, + expectedJdbcType: KType, + ): J? = + getConverter(tableColumnMetadata).cast() + .getValueFromResultSetOrElse(rs, columnIndex) { + try { + rs.getObject(columnIndex + 1) + } catch (_: Throwable) { + // TODO? + rs.getString(columnIndex + 1) + } as J? 
+ } final override fun preprocessValue( value: J?, tableColumnMetadata: TableColumnMetadata, expectedJdbcType: KType, expectedPreprocessedValueType: KType, - ): D? = getTypeMapping(tableColumnMetadata).cast().preprocessOrCast(value) + ): D? = getConverter(tableColumnMetadata).cast().preprocessOrCast(value) final override fun buildDataColumn( name: String, @@ -49,7 +66,7 @@ public abstract class AdvancedDbType(dbTypeInJdbcUrl: String) : DbType(dbTypeInJ targetColumnSchema: ColumnSchema, inferNullability: Boolean, ): DataColumn = - getTypeMapping(tableColumnMetadata).cast() + getConverter(tableColumnMetadata).cast() .buildDataColumnOrNull(name, values, inferNullability) ?: values.toDataColumn( name = name, diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt index 19fbd7ccf7..3da18fe5dd 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DbType.kt @@ -1,5 +1,7 @@ package org.jetbrains.kotlinx.dataframe.io.db +import kotlinx.datetime.LocalDateTime +import kotlinx.datetime.toKotlinLocalDateTime import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.AnyRow import org.jetbrains.kotlinx.dataframe.DataColumn @@ -27,10 +29,8 @@ import java.sql.SQLXML import java.sql.Time import java.sql.Timestamp import java.sql.Types -import java.time.LocalDateTime import java.time.OffsetDateTime import java.time.OffsetTime -import java.util.Date import java.util.UUID import kotlin.collections.toTypedArray import kotlin.reflect.KClass @@ -38,6 +38,12 @@ import kotlin.reflect.KType import kotlin.reflect.full.safeCast import kotlin.reflect.full.withNullability import kotlin.reflect.typeOf +import kotlin.time.Instant +import kotlin.time.toKotlinInstant +import kotlin.uuid.Uuid +import kotlin.uuid.toKotlinUuid +import 
java.time.LocalDateTime as JavaLocalDateTime +import java.util.Date as JavaDate /** * The `DbType` class represents a database type used for reading dataframe from the database. @@ -104,7 +110,7 @@ public abstract class DbType(public val dbTypeInJdbcUrl: String) { Types.CHAR to typeOf(), Types.VARCHAR to typeOf(), Types.LONGVARCHAR to typeOf(), - Types.DATE to typeOf(), + Types.DATE to typeOf(), Types.TIME to typeOf

().withNullability(isNullable), + resultSetReader: DbResultSetReader? = null, columnPostprocessor: DbColumnBuilder?, -): JdbcTypeMapping = - typeInformationWithPostprocessingForValueColumnOf( +): JdbcToDataFrameConverter = + jdbcToDfConverterWithPostprocessingForValueColumnOf( jdbcSourceType = jdbcSourceType, targetColumnType = targetColumnType, + resultSetReader = resultSetReader, columnPostprocessor = columnPostprocessor, ) // endregion +public fun interface DbResultSetReader { + + public fun getValue(rs: ResultSet, columnIndex: Int): J? +} + /** * This preprocessor can be created for types where you want to convert the values * coming from [java.sql.ResultSet.getObject] to a different type more suitable to be put in a [DataColumn] @@ -217,8 +258,8 @@ public fun DbValuePreprocessor<*, *>.cast(): DbValuePreproces public fun DbValuePreprocessor<*, *>.castToAny(): DbValuePreprocessor = cast() /** - * @param D the type of the column values before postprocessing. - * @param P the type of the column values after postprocessing. + * @param D the type of the column values before entering the column. + * @param P the type of the column values after entering the column. 
*/ public fun interface DbColumnBuilder { diff --git a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/mariadbH2Test.kt b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/mariadbH2Test.kt index 0707000ef6..b4c99468f5 100644 --- a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/mariadbH2Test.kt +++ b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/mariadbH2Test.kt @@ -21,8 +21,10 @@ import java.sql.Blob import java.sql.Connection import java.sql.DriverManager import java.sql.SQLException +import java.sql.Timestamp import java.util.Date import kotlin.reflect.typeOf +import kotlin.time.Instant private const val URL = "jdbc:h2:mem:test1;DB_CLOSE_DELAY=-1;MODE=MariaDB;DATABASE_TO_LOWER=TRUE" @@ -230,8 +232,8 @@ class MariadbH2Test { st.setDouble(11, i * 10.0) st.setBigDecimal(12, BigDecimal(i * 10)) st.setDate(13, java.sql.Date(System.currentTimeMillis())) - st.setTimestamp(14, java.sql.Timestamp(System.currentTimeMillis())) - st.setTimestamp(15, java.sql.Timestamp(System.currentTimeMillis())) + st.setTimestamp(14, Timestamp(System.currentTimeMillis())) + st.setTimestamp(15, Timestamp(System.currentTimeMillis())) st.setTime(16, java.sql.Time(System.currentTimeMillis())) st.setInt(17, 2023) st.setString(18, "varcharValue$i") @@ -268,8 +270,8 @@ class MariadbH2Test { st.setDouble(11, i * 20.0) st.setBigDecimal(12, BigDecimal(i * 20)) st.setDate(13, java.sql.Date(System.currentTimeMillis())) - st.setTimestamp(14, java.sql.Timestamp(System.currentTimeMillis())) - st.setTimestamp(15, java.sql.Timestamp(System.currentTimeMillis())) + st.setTimestamp(14, Timestamp(System.currentTimeMillis())) + st.setTimestamp(15, Timestamp(System.currentTimeMillis())) st.setTime(16, java.sql.Time(System.currentTimeMillis())) st.setInt(17, 2023) st.setString(18, "varcharValue$i") @@ -316,8 +318,8 @@ class MariadbH2Test { schema.columns["longblobcol"]!!.type shouldBe typeOf() schema.columns["tinyblobcol"]!!.type 
shouldBe typeOf() schema.columns["datecol"]!!.type shouldBe typeOf() - schema.columns["datetimecol"]!!.type shouldBe typeOf() - schema.columns["timestampcol"]!!.type shouldBe typeOf() + schema.columns["datetimecol"]!!.type shouldBe typeOf() + schema.columns["timestampcol"]!!.type shouldBe typeOf() schema.columns["timecol"]!!.type shouldBe typeOf() schema.columns["yearcol"]!!.type shouldBe typeOf() diff --git a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/mssqlH2Test.kt b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/mssqlH2Test.kt index 3d37005b85..f625289fa4 100644 --- a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/mssqlH2Test.kt +++ b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/mssqlH2Test.kt @@ -21,6 +21,7 @@ import java.sql.SQLException import java.util.Date import java.util.UUID import kotlin.reflect.typeOf +import kotlin.time.Instant private const val URL = "jdbc:h2:mem:testmssql;DB_CLOSE_DELAY=-1;MODE=MSSQLServer;DATABASE_TO_UPPER=FALSE;CASE_INSENSITIVE_IDENTIFIERS=TRUE" @@ -188,8 +189,8 @@ class MSSQLH2Test { schema.columns["bitColumn"]!!.type shouldBe typeOf() schema.columns["charColumn"]!!.type shouldBe typeOf() schema.columns["dateColumn"]!!.type shouldBe typeOf() - schema.columns["datetime3Column"]!!.type shouldBe typeOf() - schema.columns["datetime2Column"]!!.type shouldBe typeOf() + schema.columns["datetime3Column"]!!.type shouldBe typeOf() + schema.columns["datetime2Column"]!!.type shouldBe typeOf() schema.columns["decimalColumn"]!!.type shouldBe typeOf() schema.columns["floatColumn"]!!.type shouldBe typeOf() schema.columns["intColumn"]!!.type shouldBe typeOf() @@ -200,11 +201,11 @@ class MSSQLH2Test { schema.columns["nvarcharColumn"]!!.type shouldBe typeOf() schema.columns["nvarcharMaxColumn"]!!.type shouldBe typeOf() schema.columns["realColumn"]!!.type shouldBe typeOf() - schema.columns["smalldatetimeColumn"]!!.type shouldBe typeOf() + 
schema.columns["smalldatetimeColumn"]!!.type shouldBe typeOf() schema.columns["smallintColumn"]!!.type shouldBe typeOf() schema.columns["smallmoneyColumn"]!!.type shouldBe typeOf() schema.columns["timeColumn"]!!.type shouldBe typeOf() - schema.columns["timestampColumn"]!!.type shouldBe typeOf() + schema.columns["timestampColumn"]!!.type shouldBe typeOf() schema.columns["tinyintColumn"]!!.type shouldBe typeOf() schema.columns["varbinaryColumn"]!!.type shouldBe typeOf() schema.columns["varbinaryMaxColumn"]!!.type shouldBe typeOf() diff --git a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/mysqlH2Test.kt b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/mysqlH2Test.kt index c0a6cdbe14..86d6ebc5b2 100644 --- a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/mysqlH2Test.kt +++ b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/mysqlH2Test.kt @@ -20,8 +20,10 @@ import java.math.BigDecimal import java.sql.Connection import java.sql.DriverManager import java.sql.SQLException +import java.sql.Timestamp import java.util.Date import kotlin.reflect.typeOf +import kotlin.time.Instant // NOTE: the names of testing databases should be different to avoid collisions and should not contain the system names itself private const val URL = "jdbc:h2:mem:test2;DB_CLOSE_DELAY=-1;MODE=MySQL;DATABASE_TO_LOWER=TRUE" @@ -228,8 +230,8 @@ class MySqlH2Test { st.setDouble(11, i * 10.0) st.setBigDecimal(12, BigDecimal(i * 10)) st.setDate(13, java.sql.Date(System.currentTimeMillis())) - st.setTimestamp(14, java.sql.Timestamp(System.currentTimeMillis())) - st.setTimestamp(15, java.sql.Timestamp(System.currentTimeMillis())) + st.setTimestamp(14, Timestamp(System.currentTimeMillis())) + st.setTimestamp(15, Timestamp(System.currentTimeMillis())) st.setTime(16, java.sql.Time(System.currentTimeMillis())) st.setInt(17, 2023) st.setString(18, "varcharValue$i") @@ -265,8 +267,8 @@ class MySqlH2Test { st.setDouble(11, i * 
20.0) st.setBigDecimal(12, BigDecimal(i * 20)) st.setDate(13, java.sql.Date(System.currentTimeMillis())) - st.setTimestamp(14, java.sql.Timestamp(System.currentTimeMillis())) - st.setTimestamp(15, java.sql.Timestamp(System.currentTimeMillis())) + st.setTimestamp(14, Timestamp(System.currentTimeMillis())) + st.setTimestamp(15, Timestamp(System.currentTimeMillis())) st.setTime(16, java.sql.Time(System.currentTimeMillis())) st.setInt(17, 2023) st.setString(18, "varcharValue$i") @@ -308,8 +310,8 @@ class MySqlH2Test { schema.columns["id"]!!.type shouldBe typeOf() schema.columns["textcol"]!!.type shouldBe typeOf() schema.columns["datecol"]!!.type shouldBe typeOf() - schema.columns["datetimecol"]!!.type shouldBe typeOf() - schema.columns["timestampcol"]!!.type shouldBe typeOf() + schema.columns["datetimecol"]!!.type shouldBe typeOf() + schema.columns["timestampcol"]!!.type shouldBe typeOf() schema.columns["timecol"]!!.type shouldBe typeOf() schema.columns["yearcol"]!!.type shouldBe typeOf() schema.columns["varbinarycol"]!!.type shouldBe typeOf() From 736fe793d99d80e174d91282807b7a6574322b43 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 17 Dec 2025 11:59:04 +0100 Subject: [PATCH 13/14] exploring struct/composite types for postgresql --- .../kotlinx/dataframe/io/db/PostgreSql.kt | 2 ++ .../kotlinx/dataframe/io/h2/postgresH2Test.kt | 36 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt index 1182860067..581c36d669 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/PostgreSql.kt @@ -21,6 +21,8 @@ public object PostgreSql : DbType("postgresql") { if (tableColumnMetadata.sqlTypeName == "money") { return typeOf().withNullability(tableColumnMetadata.isNullable) } + // 
TODO: Composite types like tableColumnMetadata.sqlTypeName = ROW("a" INTEGER, "b" CHARACTER VARYING(10)) + return super.getExpectedJdbcType(tableColumnMetadata) } diff --git a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/postgresH2Test.kt b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/postgresH2Test.kt index d2b4bef65d..efd1425555 100644 --- a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/postgresH2Test.kt +++ b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/h2/postgresH2Test.kt @@ -123,6 +123,18 @@ class PostgresH2Test { connection.createStatement().execute(createTableQuery.trimIndent()) + // Table with STRUCT/ROW column to verify JDBC Types.STRUCT handling + @Language("SQL") + val createTableWithStruct = + """ + CREATE TABLE IF NOT EXISTS table3 ( + id serial PRIMARY KEY, + structCol ROW(a INT, b VARCHAR(10)) not null + ) + """.trimIndent() + + connection.createStatement().execute(createTableWithStruct) + @Language("SQL") val insertData1 = """ @@ -196,6 +208,17 @@ class PostgresH2Test { st.executeUpdate() } } + + // Insert data into table3 with ROW/STRUCT literals + @Language("SQL") + val insertStructs = + """ + INSERT INTO table3 (structCol) VALUES + (ROW(1, 'X')), + (ROW(2, 'Y')), + (ROW(3, 'Z')) + """.trimIndent() + connection.createStatement().execute(insertStructs) } @AfterClass @@ -286,6 +309,19 @@ class PostgresH2Test { table2Df[0][4] shouldBe 1001 } + @Test + fun `read composite column from table`() { + val tableName3 = "table3" + val df3 = DataFrame.readSqlTable(connection, tableName3) + + // Validate row count + df3.rowsCount() shouldBe 3 + + // Validate schema type stays as Any for STRUCT (no special mapping yet) + val schema3 = DataFrameSchema.readSqlTable(connection, tableName3) + schema3.columns["structcol"]!!.type shouldBe typeOf() + } + @Test fun `read columns of different types to check type mapping`() { val tableName1 = "table1" From 
be83cc5648aa15477e9a088ede97fc5f8eacd067 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 17 Dec 2025 15:24:19 +0100 Subject: [PATCH 14/14] added duckDb STRUCT[] column to FrameColumn conversion --- .../kotlinx/dataframe/io/db/DuckDb.kt | 50 +++++++++++++------ .../io/db/JdbcToDataFrameConverter.kt | 19 ++++++- .../kotlinx/dataframe/io/local/duckDbTest.kt | 16 +++++- 3 files changed, 67 insertions(+), 18 deletions(-) diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt index 99e8c49652..10b14ff4d8 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -42,6 +42,8 @@ import org.duckdb.DuckDBColumnType.UUID import org.duckdb.DuckDBColumnType.VARCHAR import org.duckdb.DuckDBResultSetMetaData import org.duckdb.JsonNode +import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.AnyRow import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.api.asColumnGroup @@ -63,6 +65,7 @@ import java.util.Properties import kotlin.reflect.KClass import kotlin.reflect.KTypeProjection import kotlin.reflect.full.createType +import kotlin.reflect.full.isSubtypeOf import kotlin.reflect.full.withNullability import kotlin.reflect.typeOf import kotlin.time.Instant @@ -192,22 +195,37 @@ public object DuckDb : AdvancedDbType("duckdb") { val parsedListType = parseDuckDbType(listType, true).castToAny() - val targetListType = List::class.createType( - listOf( - KTypeProjection.invariant( - parsedListType.targetSchema.type, - ), - ), - ).withNullability(isNullable) + val targetListType = List::class + .createType(listOf(KTypeProjection.invariant(parsedListType.targetSchema.type))) + .withNullability(isNullable) + + when (val listTargetSchema = 
parsedListType.targetSchema) { + // convert STRUCT[] -> DataFrame<*> to create a FrameColumn + is ColumnSchema.Group if parsedListType.expectedJdbcType.isSubtypeOf(typeOf()) -> + jdbcToDfConverterWithPreprocessingFor( + isNullable = isNullable, + targetSchema = with(listTargetSchema) { + ColumnSchema.Frame(schema, nullable, contentType) + }, + ) { sqlArray -> + sqlArray + ?.toList() + ?.let { it as List } + ?.mapNotNull { + parsedListType.cast, AnyRow>() + .preprocessOrCast(it) + }?.toDataFrame() + } - // todo maybe List should become FrameColumn - jdbcToDfConverterWithPreprocessingForValueColumnOf>( - isNullable = isNullable, - preprocessedValueType = targetListType, - ) { sqlArray -> - sqlArray - ?.toList() - ?.map { parsedListType.preprocessOrCast(it) } // recursively preprocess + else -> + jdbcToDfConverterWithPreprocessingForValueColumnOf>( + isNullable = isNullable, + preprocessedValueType = targetListType, + ) { sqlArray -> + sqlArray + ?.toList() + ?.map { parsedListType.preprocessOrCast(it) } // recursively preprocess + } } } @@ -222,7 +240,7 @@ public object DuckDb : AdvancedDbType("duckdb") { contentType = typeOf(), ) - jdbcToDfConverterWithProcessingFor, DataRow<*>>( + jdbcToDfConverterWithProcessingFor, AnyRow>( isNullable = isNullable, targetSchema = targetSchema, valuePreprocessor = { struct -> diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/JdbcToDataFrameConverter.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/JdbcToDataFrameConverter.kt index f703d8955f..3ebad6c0ee 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/JdbcToDataFrameConverter.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/JdbcToDataFrameConverter.kt @@ -68,7 +68,7 @@ public fun jdbcToDfConverterWithProcessingFor( jdbcSourceType: KType, preprocessedValueType: KType, // = jdbcSourceType targetSchema: ColumnSchema, // = ColumnSchema.Value(preprocessedValueType) - 
resultSetReader: DbResultSetReader?, + resultSetReader: DbResultSetReader? = null, valuePreprocessor: DbValuePreprocessor?, columnBuilder: DbColumnBuilder?, ): JdbcToDataFrameConverter = @@ -130,6 +130,23 @@ public fun jdbcToDfConverterWithPreprocessingFor( columnBuilder = null, ) +public inline fun jdbcToDfConverterWithPreprocessingFor( + isNullable: Boolean, + jdbcSourceType: KType = typeOf().withNullability(isNullable), + preprocessedValueType: KType = typeOf().withNullability(isNullable), + targetSchema: ColumnSchema, + resultSetReader: DbResultSetReader? = null, + valuePreprocessor: DbValuePreprocessor?, +): JdbcToDataFrameConverter = + jdbcToDfConverterWithProcessingFor( + jdbcSourceType = jdbcSourceType, + preprocessedValueType = preprocessedValueType, + targetSchema = targetSchema, + resultSetReader = resultSetReader, + valuePreprocessor = valuePreprocessor, + columnBuilder = null, + ) + public fun jdbcToDfConverterWithPostprocessingFor( jdbcSourceType: KType, targetSchema: ColumnSchema, diff --git a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt index 89bf3e9bd1..e0e324654a 100644 --- a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt +++ b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt @@ -13,9 +13,11 @@ import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.annotations.ColumnName import org.jetbrains.kotlinx.dataframe.annotations.DataSchema +import org.jetbrains.kotlinx.dataframe.api.DataRowSchema import org.jetbrains.kotlinx.dataframe.api.cast import org.jetbrains.kotlinx.dataframe.api.colsOf import org.jetbrains.kotlinx.dataframe.api.convert +import org.jetbrains.kotlinx.dataframe.api.dataFrameOf import org.jetbrains.kotlinx.dataframe.api.reorderColumnsByName import 
org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.api.single @@ -256,7 +258,10 @@ class DuckDbTest { } @DataSchema - data class NestedEntry(val i: Int, val j: String) + data class NestedEntry(val i: Int, val j: String) : DataRowSchema + + @DataSchema + data class NullableNestedEntry(val i: Int?, val j: String?) : DataRowSchema @DataSchema data class NestedTypes( @@ -264,6 +269,8 @@ class DuckDbTest { val testCol: Int, @ColumnName("ijstruct_col") val ijstructCol: NestedEntry, + @ColumnName("ijstructlist_col") + val ijstructlistCol: DataFrame, @ColumnName("intarray_col") val intarrayCol: List, @ColumnName("intlist_col") @@ -611,6 +618,7 @@ class DuckDbTest { intstringmap_col MAP(INTEGER, VARCHAR), intstrinstinggmap_col MAP(INTEGER, MAP(VARCHAR, VARCHAR)), ijstruct_col STRUCT(i INTEGER, j VARCHAR), + ijstructlist_col STRUCT(i INTEGER, j VARCHAR)[], union_col UNION(num INTEGER, text VARCHAR), ) """.trimIndent(), @@ -628,6 +636,7 @@ class DuckDbTest { MAP { 1: 'value1', 200: 'value2' }, -- int string map MAP { 1: MAP { 'value1': 'a', 'value2': 'b' }, 200: MAP { 'value1': 'c', 'value2': 'd' } }, -- int string string map { 'i': 42, 'j': 'answer' }, -- struct + list_value({ 'i': 42, 'j': 'answer' }, { 'i': 44, 'j': 'answer' }), -- struct list union_value(num := 2), -- union ) """.trimIndent(), @@ -655,6 +664,11 @@ class DuckDbTest { ) it[{ "ijstruct_col"["i"]() }] shouldBe 42 it[{ "ijstruct_col"["j"]() }] shouldBe "answer" + it[{ "ijstructlist_col">() }] shouldBe + dataFrameOf( + NestedEntry(42, "answer"), + NestedEntry(44, "answer"), + ) it["union_col"] shouldBe 2 } }