From 3c5797b942991306b05a6eae10dc262c6f6b9fda Mon Sep 17 00:00:00 2001
From: lewismc
Date: Thu, 16 Oct 2025 21:51:56 -0700
Subject: [PATCH] TIKA-4513 Instrument tika-server
---
.gitignore | 3 +-
tika-server/OPENTELEMETRY.md | 658 ++++++++++++++++++
tika-server/tika-server-core/pom.xml | 38 +
.../tika/server/core/TikaOpenTelemetry.java | 342 +++++++++
.../server/core/TikaOpenTelemetryConfig.java | 301 ++++++++
.../tika/server/core/TikaServerConfig.java | 10 +
.../tika/server/core/TikaServerProcess.java | 8 +
.../core/resource/DetectorResource.java | 35 +-
.../core/resource/MetadataResource.java | 46 +-
.../server/core/resource/TikaResource.java | 42 ++
.../resources/tika-server-config-default.xml | 28 +
.../server/core/TikaOpenTelemetryTest.java | 187 +++++
tika-server/tika-server-standard/bin/tika | 17 +-
.../tika-server-standard/docker/README.md | 146 ++++
.../docker/docker-compose-otel.yml | 47 ++
.../tika-server-standard/otel-agent/README.md | 216 ++++++
16 files changed, 2115 insertions(+), 9 deletions(-)
create mode 100644 tika-server/OPENTELEMETRY.md
create mode 100644 tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetry.java
create mode 100644 tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetryConfig.java
create mode 100644 tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaOpenTelemetryTest.java
create mode 100644 tika-server/tika-server-standard/docker/README.md
create mode 100644 tika-server/tika-server-standard/docker/docker-compose-otel.yml
create mode 100644 tika-server/tika-server-standard/otel-agent/README.md
diff --git a/.gitignore b/.gitignore
index 011a1f3a01..c73b59429f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,5 @@ nb-configuration.xml
*.DS_Store
*.tmp-inception
*.snap
-.*.swp
\ No newline at end of file
+.*.swp
+.vscode
diff --git a/tika-server/OPENTELEMETRY.md b/tika-server/OPENTELEMETRY.md
new file mode 100644
index 0000000000..789a10474c
--- /dev/null
+++ b/tika-server/OPENTELEMETRY.md
@@ -0,0 +1,658 @@
+# OpenTelemetry Instrumentation for Apache Tika Server
+
+This document describes how to enable and use OpenTelemetry (OTEL) observability in Apache Tika Server for comprehensive monitoring of traces, metrics, and logs.
+
+## Table of Contents
+
+1. [Introduction](#introduction)
+2. [Prerequisites](#prerequisites)
+3. [Quick Start](#quick-start)
+4. [Configuration](#configuration)
+5. [Auto-Instrumentation Setup](#auto-instrumentation-setup)
+6. [Manual Instrumentation](#manual-instrumentation)
+7. [Exporters](#exporters)
+8. [Docker Integration](#docker-integration)
+9. [Verifying Setup](#verifying-setup)
+10. [Performance Considerations](#performance-considerations)
+11. [Troubleshooting](#troubleshooting)
+
+## Introduction
+
+OpenTelemetry provides standardized observability instrumentation for Tika Server, enabling:
+
+- **Distributed Tracing**: End-to-end request flows from HTTP ingestion to parser execution
+- **Metrics**: Throughput, error rates, and resource usage
+- **Structured Logs**: Correlated with traces via trace/span IDs
+
+### Why OpenTelemetry?
+
+- **Vendor-neutral**: Works with Jaeger, Zipkin, Prometheus, Grafana, and many others
+- **Future-proof**: Semantic conventions ensure compatibility with evolving backends
+- **Low overhead**: Configurable sampling and async export minimize performance impact
+- **Rich ecosystem**: Integrates with modern observability platforms
+
+## Prerequisites
+
+- **Java**: 11 or higher
+- **Apache Tika Server**: 4.0.0-SNAPSHOT or later
+- **OpenTelemetry Collector or Backend**: Jaeger, Zipkin, or OTLP-compatible collector
+
+## Quick Start
+
+### 1. Enable OpenTelemetry via Environment Variables
+
+The simplest way to enable OpenTelemetry is through environment variables:
+
+```bash
+# Option A (choose one): manual instrumentation (uses gRPC by default)
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+export OTEL_SERVICE_NAME=my-tika-server
+
+# Option B (instead of A): auto-instrumentation with Java agent (uses HTTP by default)
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
+export OTEL_SERVICE_NAME=my-tika-server
+```
+
+**Port Reference:**
+- **4317**: OTLP gRPC endpoint (default for manual instrumentation)
+- **4318**: OTLP HTTP endpoint (default for Java agent auto-instrumentation)
+
+Setting `OTEL_EXPORTER_OTLP_ENDPOINT` automatically enables OpenTelemetry.
+
+### 2. Start a Local Jaeger Instance
+
+```bash
+cd tika-server/tika-server-standard/docker
+docker-compose -f docker-compose-otel.yml up -d jaeger
+```
+
+Jaeger UI will be available at: http://localhost:16686
+
+### 3. Start Tika Server
+
+```bash
+java -jar tika-server-standard/target/tika-server-standard-*.jar
+```
+
+### 4. Send Test Requests
+
+**Important**: Include the `File-Name` header to properly populate the `tika.resource_name` span attribute:
+
+```bash
+# Parse a document
+curl -T mydocument.pdf \
+ -H "File-Name: mydocument.pdf" \
+ http://localhost:9998/tika
+
+# Detect MIME type
+curl -T sample.txt \
+ -H "File-Name: sample.txt" \
+ http://localhost:9998/detect/stream
+
+# Extract metadata
+curl -T mydocument.pdf \
+ -H "File-Name: mydocument.pdf" \
+ http://localhost:9998/meta
+
+# Alternative: Use Content-Disposition header (standard HTTP)
+curl -T document.docx \
+ -H "Content-Disposition: attachment; filename=document.docx" \
+ http://localhost:9998/tika
+
+# Or use multipart form upload (filename included automatically)
+curl -F "file=@mydocument.pdf" http://localhost:9998/tika/form
+```
+
+### 5. View Traces
+
+Open http://localhost:16686, select "my-tika-server" from the service dropdown, and click "Find Traces".
+
+## Configuration
+
+### Environment Variables
+
+OpenTelemetry can be configured entirely through environment variables:
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `OTEL_SDK_DISABLED` | Disable OpenTelemetry completely | `false` |
+| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP endpoint URL | `http://localhost:4317` (gRPC) or `http://localhost:4318` (HTTP) |
+| `OTEL_EXPORTER_OTLP_PROTOCOL` | Protocol to use | `grpc` (manual) or `http/protobuf` (agent) |
+| `OTEL_SERVICE_NAME` | Service name for identification | `tika-server` |
+| `OTEL_TRACES_SAMPLER` | Sampling strategy | `parentbased_always_on` |
+| `OTEL_TRACES_SAMPLER_ARG` | Sampling probability (0.0-1.0) | `1.0` |
+
+**Example:**
+
+```bash
+export OTEL_EXPORTER_OTLP_ENDPOINT=https://my-collector.example.com:4317
+export OTEL_SERVICE_NAME=production-tika-server
+export OTEL_TRACES_SAMPLER=traceidratio
+export OTEL_TRACES_SAMPLER_ARG=0.1 # Sample 10% of traces
+```
+
+### XML Configuration
+
+Alternatively, configure via `tika-server-config.xml`:
+
+```xml
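+<properties>
+  <server>
+    <params>
+      <port>9998</port>
+    </params>
+    <!-- NOTE(review): element nesting reconstructed; confirm against tika-server-config-default.xml -->
+    <openTelemetry>
+      <enabled>true</enabled>
+      <exporterType>otlp</exporterType>
+      <otlpEndpoint>http://localhost:4317</otlpEndpoint>
+      <serviceName>tika-server</serviceName>
+      <samplingProbability>1.0</samplingProbability>
+      <exportTimeoutMillis>30000</exportTimeoutMillis>
+    </openTelemetry>
+  </server>
+</properties>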
+```
+
+**Note:** Environment variables take precedence over XML configuration.
+
+### Configuration Options
+
+- **enabled**: Enable/disable OpenTelemetry (`true`/`false`)
+- **exporterType**: Currently only `otlp` is supported
+- **otlpEndpoint**: OTLP gRPC endpoint URL
+- **serviceName**: Identifier for this Tika Server instance
+- **samplingProbability**: Fraction of traces to sample (0.0 to 1.0)
+- **exportTimeoutMillis**: Timeout for exporting telemetry data
+
+## Auto-Instrumentation Setup
+
+Auto-instrumentation provides automatic tracing for HTTP requests and other framework-level operations.
+
+### Download the OpenTelemetry Java Agent
+
+```bash
+cd tika-server/tika-server-standard/otel-agent
+curl -L -O https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar
+```
+
+### Run Tika Server with Auto-Instrumentation
+
+```bash
+# Using HTTP protocol (default for Java agent) - port 4318
+java -javaagent:otel-agent/opentelemetry-javaagent.jar \
+ -Dotel.service.name=tika-server \
+ -Dotel.exporter.otlp.endpoint=http://localhost:4318 \
+ -jar tika-server-standard/target/tika-server-standard-*.jar
+
+# Or using gRPC protocol - port 4317
+java -javaagent:otel-agent/opentelemetry-javaagent.jar \
+ -Dotel.service.name=tika-server \
+ -Dotel.exporter.otlp.endpoint=http://localhost:4317 \
+ -Dotel.exporter.otlp.protocol=grpc \
+ -jar tika-server-standard/target/tika-server-standard-*.jar
+```
+
+**Important**: The Java agent defaults to HTTP protocol (port 4318). If using gRPC (port 4317), you must specify `-Dotel.exporter.otlp.protocol=grpc`.
+
+### What Gets Instrumented Automatically?
+
+- HTTP server requests (Jetty/CXF)
+- HTTP client calls (if Tika makes outbound HTTP requests)
+- JDBC database operations (if using database features)
+- JVM metrics (memory, GC, threads, CPU)
+
+See [otel-agent/README.md](tika-server-standard/otel-agent/README.md) for more details.
+
+## Manual Instrumentation
+
+Tika Server includes manual instrumentation for Tika-specific operations:
+
+### Instrumented Endpoints
+
+| Endpoint | Span Name | Attributes |
+|----------|-----------|------------|
+| `/tika` | `tika.parse` | `tika.resource_name`, `tika.content_type`, `tika.endpoint` |
+| `/detect` | `tika.detect` | `tika.resource_name`, `tika.detected_type`, `tika.endpoint` |
+| `/meta` | `tika.metadata.extract` | `tika.resource_name`, `tika.metadata_count`, `tika.endpoint` |
+
+### Span Attributes
+
+- **tika.resource_name**: Filename or resource being processed
+ - Extracted from `File-Name` request header
+ - Or from `Content-Disposition: attachment; filename=...` header
+ - Automatically populated when using multipart form uploads
+ - Displays as "unknown" if no filename header is provided
+- **tika.content_type**: Detected MIME type
+- **tika.detected_type**: Result of MIME detection
+- **tika.metadata_count**: Number of metadata fields extracted
+- **tika.endpoint**: API endpoint invoked
+
+### Error Handling
+
+Exceptions are automatically recorded in spans with:
+- Exception type and message
+- Stack traces
+- Span status set to ERROR
+
+## Exporters
+
+### OTLP (Recommended)
+
+OTLP (OpenTelemetry Protocol) is the native protocol and recommended exporter. OTLP supports two transport protocols:
+
+#### gRPC (Port 4317)
+- **Used by**: Manual instrumentation (default in our code)
+- **Protocol**: Binary, efficient
+- **Configuration:**
+
+```bash
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+# Protocol is gRPC by default for manual instrumentation
+```
+
+#### HTTP/Protobuf (Port 4318)
+- **Used by**: Java agent auto-instrumentation (default)
+- **Protocol**: HTTP with protobuf encoding
+- **Configuration:**
+
+```bash
+# For Java agent (HTTP is default)
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
+
+# Or explicitly specify protocol
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
+export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
+```
+
+**Supported Backends:**
+- OpenTelemetry Collector
+- Jaeger (v1.35+)
+- Grafana Tempo
+- Grafana Cloud
+- Honeycomb
+- Lightstep
+- New Relic
+- Many others
+
+### Jaeger
+
+Direct export to Jaeger using OTLP:
+
+```bash
+docker run -d --name jaeger \
+ -p 16686:16686 \
+ -p 4317:4317 \
+ jaegertracing/all-in-one:latest
+```
+
+Access Jaeger UI: http://localhost:16686
+
+### Grafana Cloud
+
+```bash
+export OTEL_EXPORTER_OTLP_ENDPOINT=https://otlp-gateway-prod-us-central-0.grafana.net/otlp
+export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Basic <base64-encoded instanceID:token>"
+```
+
+### Console Exporter (Development)
+
+For development/debugging, use the console exporter:
+
+```java
+// In TikaOpenTelemetry.java, replace OtlpGrpcSpanExporter with:
+LoggingSpanExporter spanExporter = LoggingSpanExporter.create();
+```
+
+## Docker Integration
+
+### Using Docker Compose
+
+Start Tika Server with observability stack:
+
+```bash
+cd tika-server/tika-server-standard/docker
+docker-compose -f docker-compose-otel.yml up -d
+```
+
+This starts:
+- Jaeger (traces + UI)
+- Optionally: Prometheus (metrics)
+
+### Running Tika Server in Docker
+
+```bash
+docker run -d \
+ --name tika-server \
+ --network tika-otel_tika-otel \
+ -e OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4317 \
+ -e OTEL_SERVICE_NAME=tika-server \
+ -p 9998:9998 \
+ apache/tika:latest
+```
+
+### Kubernetes / Helm
+
+For production Kubernetes deployments, use the OpenTelemetry Operator:
+
+```yaml
+apiVersion: opentelemetry.io/v1alpha1
+kind: Instrumentation
+metadata:
+ name: tika-instrumentation
+spec:
+ exporter:
+ endpoint: http://otel-collector:4317
+ propagators:
+ - tracecontext
+ - baggage
+ sampler:
+ type: parentbased_traceidratio
+ argument: "0.25"
+```
+
+Apply to Tika Server deployment with annotation:
+```yaml
+metadata:
+ annotations:
+ instrumentation.opentelemetry.io/inject-java: "true"
+```
+
+See the Tika Helm chart documentation for more details.
+
+## Verifying Setup
+
+### 1. Check Tika Server Logs
+
+Look for initialization messages:
+
+```
+INFO [main] o.a.t.s.c.TikaOpenTelemetry - Initializing OpenTelemetry: TikaOpenTelemetryConfig{enabled=true, ...}
+INFO [main] o.a.t.s.c.TikaOpenTelemetry - OpenTelemetry initialized successfully
+```
+
+### 2. Send Test Requests
+
+**Important**: Always include filename headers to populate the `tika.resource_name` attribute in traces.
+
+```bash
+# Create a test file
+echo "Hello OpenTelemetry" > test.txt
+
+# Parse with Tika (include File-Name header)
+curl -T test.txt \
+ -H "File-Name: test.txt" \
+ http://localhost:9998/tika
+
+# Detect MIME type
+curl -T test.txt \
+ -H "File-Name: test.txt" \
+ http://localhost:9998/detect/stream
+
+# Extract metadata
+curl -T test.txt \
+ -H "File-Name: test.txt" \
+ http://localhost:9998/meta
+
+# Alternative methods to include filename:
+
+# Method 1: Content-Disposition header (HTTP standard)
+curl -T test.txt \
+ -H "Content-Disposition: attachment; filename=test.txt" \
+ http://localhost:9998/tika
+
+# Method 2: Multipart form upload (filename automatic)
+curl -F "file=@test.txt" http://localhost:9998/tika/form
+```
+
+**Why include filename headers?**
+Without the filename header, the `tika.resource_name` span attribute will show as "unknown", making it harder to identify which document was processed in traces.
+
+### 3. View in Jaeger
+
+1. Open http://localhost:16686
+2. Service: Select your service name (e.g., "tika-server")
+3. Click "Find Traces"
+4. Click on a trace to see detailed spans
+
+### Expected Span Structure
+
+**With Auto-Instrumentation (Java agent):**
+```
+HTTP PUT /tika
+ └─ tika.parse
+ Attributes:
+ - tika.resource_name: test.txt ← From File-Name header
+ - tika.content_type: text/plain ← Auto-detected by Tika
+ - tika.endpoint: /tika
+ - span.status: OK
+```
+
+**Manual Instrumentation Only:**
+```
+tika.parse
+ Attributes:
+ - tika.resource_name: mydocument.pdf
+ - tika.content_type: application/pdf
+ - tika.endpoint: /tika
+```
+
+**Note**: If the filename header is missing, `tika.resource_name` will show as "unknown".
+
+## Performance Considerations
+
+### Overhead
+
+OpenTelemetry adds minimal overhead when properly configured:
+- **Disabled**: No overhead
+- **Enabled with sampling**: 1-3% typical overhead
+- **Enabled without sampling**: 3-5% worst-case overhead
+
+### Sampling Strategies
+
+**Always On** (Default):
+```bash
+export OTEL_TRACES_SAMPLER=always_on
+```
+Captures every trace. Good for development and low-traffic services.
+
+**Probability-Based**:
+```bash
+export OTEL_TRACES_SAMPLER=traceidratio
+export OTEL_TRACES_SAMPLER_ARG=0.1 # 10% sampling
+```
+Samples a percentage of traces. Reduces overhead and storage costs.
+
+**Parent-Based** (Recommended):
+```bash
+export OTEL_TRACES_SAMPLER=parentbased_traceidratio
+export OTEL_TRACES_SAMPLER_ARG=0.1
+```
+Respects parent trace sampling decisions (for distributed tracing).
+
+### Async Export
+
+Telemetry data is exported asynchronously in batches, preventing blocking of request processing.
+
+### Resource Limits
+
+The OpenTelemetry SDK uses bounded queues to prevent memory issues:
+- Default queue size: 2048 spans
+- Spans are dropped if queue is full (counted in metrics)
+
+## Troubleshooting
+
+### Traces Not Appearing
+
+**Problem**: No traces visible in Jaeger/backend.
+
+**Solutions**:
+
+1. **Check OpenTelemetry is enabled:**
+ ```bash
+ grep "OpenTelemetry" tika-server.log
+ ```
+ Should see "OpenTelemetry initialized successfully".
+
+2. **Verify endpoint is reachable:**
+ ```bash
+ telnet localhost 4317
+ ```
+
+3. **Check for errors in Tika logs:**
+ ```bash
+ grep "ERROR.*OpenTelemetry" tika-server.log
+ ```
+
+4. **Verify backend is running:**
+ ```bash
+ docker ps | grep jaeger
+ ```
+
+### Connection Refused
+
+**Problem**: `Connection refused` to OTLP endpoint.
+
+**Solutions**:
+
+1. **Start Jaeger/collector:**
+ ```bash
+ docker-compose -f docker/docker-compose-otel.yml up -d jaeger
+ ```
+
+2. **Verify correct port for your protocol:**
+ - **Manual instrumentation (gRPC)**: Use port `4317`
+ - **Auto-instrumentation (HTTP)**: Use port `4318`
+
+ ```bash
+ # Manual:
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+
+ # Agent:
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
+ ```
+
+3. **Check firewall rules:**
+ Ensure ports 4317 (gRPC) and 4318 (HTTP) are not blocked.
+
+4. **Use correct hostname:**
+ - Local: `http://localhost:4317` or `http://localhost:4318`
+ - Docker: `http://jaeger:4317` or `http://jaeger:4318`
+ - Docker Desktop: `http://host.docker.internal:4317` or `http://host.docker.internal:4318`
+
+### Wrong Port/Protocol
+
+**Problem**: Warning in logs: "OTLP exporter endpoint port is likely incorrect for protocol version..."
+
+**Cause**: Port and protocol mismatch.
+
+**Solution**: Match the port to the protocol:
+
+```bash
+# If using gRPC (manual instrumentation):
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+export OTEL_EXPORTER_OTLP_PROTOCOL=grpc
+
+# If using HTTP (Java agent default):
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
+# HTTP is the default for agent, no need to specify protocol
+```
+
+**Quick Reference:**
+- gRPC → Port 4317
+- HTTP → Port 4318
+
+### High Overhead
+
+**Problem**: Tika Server performance degraded after enabling OTEL.
+
+**Solutions**:
+
+1. **Enable sampling:**
+ ```bash
+ export OTEL_TRACES_SAMPLER=traceidratio
+ export OTEL_TRACES_SAMPLER_ARG=0.1
+ ```
+
+2. **Disable auto-instrumentation:**
+ Remove `-javaagent` flag if only manual instrumentation is needed.
+
+3. **Increase export batch size:**
+ Reduces export frequency (in code: `BatchSpanProcessor.builder().setMaxExportBatchSize(512)`).
+
+### Spans Missing Attributes
+
+**Problem**: Spans don't show expected attributes (e.g., `tika.resource_name` shows "unknown").
+
+**Causes**:
+- **Missing filename header**: The `File-Name` or `Content-Disposition` header was not included in the request
+- **Attributes are null or not set**
+
+**Solutions**:
+
+1. **Include filename in requests:**
+ ```bash
+ # Add File-Name header
+ curl -T document.pdf -H "File-Name: document.pdf" http://localhost:9998/tika
+
+ # Or use Content-Disposition
+ curl -T document.pdf -H "Content-Disposition: attachment; filename=document.pdf" http://localhost:9998/tika
+
+ # Or use multipart form
+ curl -F "file=@document.pdf" http://localhost:9998/tika/form
+ ```
+
+2. **Check Tika Server logs for warnings:**
+ ```bash
+ grep "WARN" tika-server.log | grep -i metadata
+ ```
+
+### Duplicate Spans
+
+**Problem**: Seeing duplicate spans for the same operation.
+
+**Cause**: Both auto and manual instrumentation creating spans.
+
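+**Solution**: This is expected when both are active. The Java agent creates the HTTP-level span and the manual instrumentation creates a Tika-specific child span (for example `tika.parse`) nested beneath it, so the two spans describe different layers of the same request rather than duplicating each other.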
+
+## Further Reading
+
+- [OpenTelemetry Official Documentation](https://opentelemetry.io/docs/)
+- [OpenTelemetry Java SDK](https://opentelemetry.io/docs/instrumentation/java/)
+- [OTLP Specification](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/protocol/otlp.md)
+- [Semantic Conventions](https://github.com/open-telemetry/semantic-conventions)
+- [Jaeger Documentation](https://www.jaegertracing.io/docs/)
+- [Tika Wiki](https://cwiki.apache.org/confluence/display/TIKA)
+
+## Contributing
+
+To add more instrumentation to Tika Server:
+
+1. Import OpenTelemetry API:
+ ```java
+ import io.opentelemetry.api.trace.Span;
+ import io.opentelemetry.api.trace.Tracer;
+ ```
+
+2. Get tracer instance:
+ ```java
+ Tracer tracer = TikaOpenTelemetry.getTracer();
+ ```
+
+3. Create spans:
+ ```java
+ Span span = tracer.spanBuilder("operation.name")
+ .setAttribute("key", "value")
+ .startSpan();
+ try {
+ // Your code
+ span.setStatus(StatusCode.OK);
+ } catch (Exception e) {
+ span.recordException(e);
+ span.setStatus(StatusCode.ERROR);
+ } finally {
+ span.end();
+ }
+ ```
+
+See existing instrumentation in `TikaResource.java`, `DetectorResource.java`, and `MetadataResource.java` for examples.
diff --git a/tika-server/tika-server-core/pom.xml b/tika-server/tika-server-core/pom.xml
index 8dc3c5a825..18af603eb2 100644
--- a/tika-server/tika-server-core/pom.xml
+++ b/tika-server/tika-server-core/pom.xml
@@ -119,6 +119,44 @@
org.apache.logging.log4j
log4j-slf4j2-impl
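+
+    <!-- OpenTelemetry (optional dependencies) -->
+    <dependency>
+      <groupId>io.opentelemetry</groupId>
+      <artifactId>opentelemetry-api</artifactId>
+      <version>1.55.0</version>
+      <optional>true</optional>
+    </dependency>
+    <dependency>
+      <groupId>io.opentelemetry</groupId>
+      <artifactId>opentelemetry-sdk</artifactId>
+      <version>1.55.0</version>
+      <optional>true</optional>
+    </dependency>
+    <dependency>
+      <groupId>io.opentelemetry</groupId>
+      <artifactId>opentelemetry-exporter-otlp</artifactId>
+      <version>1.55.0</version>
+      <optional>true</optional>
+    </dependency>
+    <dependency>
+      <groupId>io.opentelemetry.instrumentation</groupId>
+      <artifactId>opentelemetry-instrumentation-annotations</artifactId>
+      <version>2.20.1</version>
+      <optional>true</optional>
+    </dependency>
+    <dependency>
+      <groupId>io.opentelemetry.instrumentation</groupId>
+      <artifactId>opentelemetry-log4j-appender-2.17</artifactId>
+      <version>2.20.1-alpha</version>
+      <optional>true</optional>
+    </dependency>
+    <dependency>
+      <groupId>io.opentelemetry</groupId>
+      <artifactId>opentelemetry-sdk-testing</artifactId>
+      <version>1.55.0</version>
+      <scope>test</scope>
+    </dependency>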
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetry.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetry.java
new file mode 100644
index 0000000000..b69e70cec3
--- /dev/null
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetry.java
@@ -0,0 +1,342 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server.core;
+
+import java.time.Duration;
+import java.util.concurrent.TimeUnit;
+
+import io.opentelemetry.api.GlobalOpenTelemetry;
+import io.opentelemetry.api.OpenTelemetry;
+import io.opentelemetry.api.common.AttributeKey;
+import io.opentelemetry.api.common.Attributes;
+import io.opentelemetry.api.metrics.Meter;
+import io.opentelemetry.api.trace.Tracer;
+import io.opentelemetry.exporter.otlp.metrics.OtlpGrpcMetricExporter;
+import io.opentelemetry.exporter.otlp.trace.OtlpGrpcSpanExporter;
+import io.opentelemetry.sdk.OpenTelemetrySdk;
+import io.opentelemetry.sdk.metrics.SdkMeterProvider;
+import io.opentelemetry.sdk.metrics.export.PeriodicMetricReader;
+import io.opentelemetry.sdk.resources.Resource;
+import io.opentelemetry.sdk.trace.SdkTracerProvider;
+import io.opentelemetry.sdk.trace.export.BatchSpanProcessor;
+import io.opentelemetry.sdk.trace.samplers.Sampler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Singleton class for managing OpenTelemetry instrumentation in Tika Server.
+ *
+ * <p>This class provides centralized management of the OpenTelemetry SDK lifecycle,
+ * including initialization, configuration, and graceful shutdown. It exposes
+ * {@link Tracer} and {@link Meter} instances for creating custom spans and metrics
+ * throughout the Tika Server codebase.
+ *
+ * <p>If a global OpenTelemetry instance has already been registered (typically by the
+ * OpenTelemetry Java agent), this class defers to that instance so that manual and
+ * auto-instrumentation can coexist.
+ *
+ * <p>State is static; {@link #initialize} and {@link #shutdown()} are synchronized.
+ *
+ * <p>Usage example:
+ * <pre>{@code
+ * // Initialize during server startup
+ * TikaOpenTelemetry.initialize(config);
+ *
+ * // Create custom spans
+ * Tracer tracer = TikaOpenTelemetry.getTracer();
+ * Span span = tracer.spanBuilder("custom.operation").startSpan();
+ * try {
+ *     // Your code here
+ *     span.setStatus(StatusCode.OK);
+ * } finally {
+ *     span.end();
+ * }
+ *
+ * // Shutdown during server teardown
+ * TikaOpenTelemetry.shutdown();
+ * }</pre>
+ *
+ * @since 4.0.0
+ */
+public class TikaOpenTelemetry {
+
+    private static final Logger LOG = LoggerFactory.getLogger(TikaOpenTelemetry.class);
+
+    /** Instrumentation library name for Tika Server */
+    private static final String INSTRUMENTATION_NAME = "org.apache.tika.server";
+
+    /** Instrumentation library version */
+    private static final String INSTRUMENTATION_VERSION = "1.0.0";
+
+    private static volatile OpenTelemetry openTelemetry = null;
+    private static volatile Tracer tracer = null;
+    private static volatile Meter meter = null;
+    private static volatile boolean initialized = false;
+    private static volatile boolean enabled = false;
+    private static volatile SdkTracerProvider tracerProvider = null;
+    private static volatile SdkMeterProvider meterProvider = null;
+
+    /**
+     * Initializes OpenTelemetry with the provided configuration.
+     *
+     * <p>This method is thread-safe and will only initialize once. Subsequent calls
+     * will log a warning and return immediately.
+     *
+     * <p>The initialization process:
+     * <ol>
+     *   <li>Builds resource attributes (service name, version)</li>
+     *   <li>Configures OTLP gRPC exporters for traces and metrics</li>
+     *   <li>Configures sampling based on the provided probability</li>
+     *   <li>Creates and registers a new OpenTelemetry SDK instance globally</li>
+     *   <li>If a global instance is already registered (for example by the Java agent),
+     *       uses that instance instead and closes the providers created here</li>
+     * </ol>
+     *
+     * <p>If configuration has {@code enabled=false}, or if initialization fails,
+     * OpenTelemetry remains disabled and noop implementations are returned by
+     * {@link #getTracer()} and {@link #getMeter()}. Any providers created before a
+     * failure are closed so their exporters do not leak.
+     *
+     * @param config the OpenTelemetry configuration settings
+     */
+    public static synchronized void initialize(TikaOpenTelemetryConfig config) {
+        if (initialized) {
+            LOG.warn("OpenTelemetry already initialized, skipping");
+            return;
+        }
+
+        if (!config.isEnabled()) {
+            LOG.info("OpenTelemetry is disabled");
+            initialized = true;
+            enabled = false;
+            return;
+        }
+
+        try {
+            LOG.info("Initializing OpenTelemetry: {}", config);
+
+            Resource resource = Resource.getDefault().merge(Resource.create(
+                    Attributes.builder()
+                            .put(AttributeKey.stringKey("service.name"), config.getServiceName())
+                            .put(AttributeKey.stringKey("service.version"), INSTRUMENTATION_VERSION)
+                            .build()));
+
+            // Configure tracer provider
+            OtlpGrpcSpanExporter spanExporter = OtlpGrpcSpanExporter.builder()
+                    .setEndpoint(config.getOtlpEndpoint())
+                    .setTimeout(Duration.ofMillis(config.getExportTimeoutMillis()))
+                    .build();
+
+            Sampler sampler = Sampler.traceIdRatioBased(config.getSamplingProbability());
+
+            tracerProvider = SdkTracerProvider.builder()
+                    .addSpanProcessor(BatchSpanProcessor.builder(spanExporter).build())
+                    .setResource(resource)
+                    .setSampler(sampler)
+                    .build();
+
+            // Configure meter provider
+            OtlpGrpcMetricExporter metricExporter = OtlpGrpcMetricExporter.builder()
+                    .setEndpoint(config.getOtlpEndpoint())
+                    .setTimeout(Duration.ofMillis(config.getExportTimeoutMillis()))
+                    .build();
+
+            meterProvider = SdkMeterProvider.builder()
+                    .registerMetricReader(
+                            PeriodicMetricReader.builder(metricExporter)
+                                    .setInterval(Duration.ofSeconds(60))
+                                    .build())
+                    .setResource(resource)
+                    .build();
+
+            // Build and register OpenTelemetry SDK globally
+            // This may fail if the Java agent has already set the global instance
+            try {
+                openTelemetry = OpenTelemetrySdk.builder()
+                        .setTracerProvider(tracerProvider)
+                        .setMeterProvider(meterProvider)
+                        .buildAndRegisterGlobal();
+                LOG.info("OpenTelemetry initialized successfully");
+            } catch (IllegalStateException alreadyRegistered) {
+                // A global instance already exists (typically set by the Java agent)
+                LOG.info("OpenTelemetry Java agent detected. Using agent's configuration (service.name from -Dotel.service.name)");
+                // Our own providers are redundant now; release their exporter resources
+                closeProviders();
+                openTelemetry = GlobalOpenTelemetry.get();
+            }
+
+            tracer = openTelemetry.getTracer(INSTRUMENTATION_NAME, INSTRUMENTATION_VERSION);
+            meter = openTelemetry.getMeter(INSTRUMENTATION_NAME);
+            initialized = true;
+            enabled = true;
+
+        } catch (Exception e) {
+            LOG.error("Failed to initialize OpenTelemetry", e);
+            // Do not leak exporter threads/connections from a half-built SDK
+            closeProviders();
+            initialized = true;
+            enabled = false;
+        }
+    }
+
+    /**
+     * Closes the SDK providers created by this class, if any, and clears the references.
+     * Failures are logged rather than propagated so cleanup never masks the original error.
+     */
+    private static void closeProviders() {
+        try {
+            if (tracerProvider != null) {
+                tracerProvider.close();
+            }
+            if (meterProvider != null) {
+                meterProvider.close();
+            }
+        } catch (Exception e) {
+            LOG.warn("Error closing OpenTelemetry providers", e);
+        } finally {
+            tracerProvider = null;
+            meterProvider = null;
+        }
+    }
+
+    /**
+     * Returns the OpenTelemetry tracer instance for creating custom spans.
+     *
+     * <p>The tracer can be used to instrument custom operations and add
+     * application-specific spans to traces. If OpenTelemetry is not enabled
+     * or not initialized, returns a noop tracer that performs no operations.
+     * <p>Example usage:
+     * <pre>{@code
+     * Tracer tracer = TikaOpenTelemetry.getTracer();
+     * Span span = tracer.spanBuilder("custom.operation")
+     *     .setAttribute("key", "value")
+     *     .startSpan();
+     * try (Scope scope = span.makeCurrent()) {
+     *     // Your instrumented code
+     *     span.setStatus(StatusCode.OK);
+     * } finally {
+     *     span.end();
+     * }
+     * }</pre>
+     *
+     * @return Tracer instance for creating spans, never null
+     */
+    public static Tracer getTracer() {
+        if (!enabled || tracer == null) {
+            return OpenTelemetry.noop().getTracer(INSTRUMENTATION_NAME);
+        }
+        return tracer;
+    }
+
+    /**
+     * Returns the OpenTelemetry meter instance for recording metrics.
+     *
+     * <p>The meter can be used to create counters, gauges, and histograms for
+     * recording custom metrics. If OpenTelemetry is not enabled or not
+     * initialized, returns a noop meter that performs no operations.
+     * <p>Example usage:
+     * <pre>{@code
+     * Meter meter = TikaOpenTelemetry.getMeter();
+     * LongCounter counter = meter.counterBuilder("parse.count")
+     *     .setDescription("Number of documents parsed")
+     *     .build();
+     * counter.add(1, Attributes.builder()
+     *     .put("content_type", "application/pdf")
+     *     .build());
+     * }</pre>
+     *
+     * @return Meter instance for recording metrics, never null
+     */
+    public static Meter getMeter() {
+        if (!enabled || meter == null) {
+            return OpenTelemetry.noop().getMeter(INSTRUMENTATION_NAME);
+        }
+        return meter;
+    }
+
+    /**
+     * Checks if OpenTelemetry instrumentation is enabled and active.
+     *
+     * <p>This method can be used to conditionally create spans or metrics only
+     * when OpenTelemetry is active, though it's generally not necessary as
+     * {@link #getTracer()} and {@link #getMeter()} return noop implementations
+     * when disabled.
+     *
+     * @return true if OpenTelemetry is enabled and initialized, false otherwise
+     */
+    public static boolean isEnabled() {
+        return enabled;
+    }
+
+    /**
+     * Gracefully shuts down the OpenTelemetry SDK and flushes any pending telemetry data.
+     *
+     * <p>This method should be called during application shutdown to ensure all
+     * traces and metrics are properly exported before the process terminates.
+     * The shutdown process:
+     * <ul>
+     *   <li>Shuts down the tracer provider and then the meter provider created by this class</li>
+     *   <li>Waits up to 10 seconds for each provider to finish exporting</li>
+     *   <li>Logs a warning, rather than success, if either shutdown fails or times out</li>
+     *   <li>Clears the provider references and marks instrumentation as disabled</li>
+     * </ul>
+     *
+     * <p>This method is thread-safe and idempotent. If OpenTelemetry is not initialized
+     * or already shutdown, this method returns immediately without error.
+     * <p><b>Note:</b> If the OpenTelemetry Java agent is in use, this method
+     * will not shut down the agent's SDK, only the manually created providers (if any).
+     */
+    public static synchronized void shutdown() {
+        if (!initialized || !enabled) {
+            return;
+        }
+
+        LOG.info("Shutting down OpenTelemetry");
+
+        try {
+            boolean flushed = true;
+            if (tracerProvider != null) {
+                flushed &= tracerProvider.shutdown().join(10, TimeUnit.SECONDS).isSuccess();
+            }
+            if (meterProvider != null) {
+                flushed &= meterProvider.shutdown().join(10, TimeUnit.SECONDS).isSuccess();
+            }
+            if (flushed) {
+                LOG.info("OpenTelemetry shut down successfully");
+            } else {
+                LOG.warn("OpenTelemetry shutdown failed or timed out; "
+                        + "some telemetry may not have been exported");
+            }
+        } catch (Exception e) {
+            LOG.error("Error shutting down OpenTelemetry", e);
+        } finally {
+            // Drop references to the closed providers; later calls return early via 'enabled'
+            tracerProvider = null;
+            meterProvider = null;
+            enabled = false;
+        }
+    }
+
+    /**
+     * Returns the active OpenTelemetry instance.
+     *
+     * @return the OpenTelemetry instance in use, or a noop instance when disabled or uninitialized
+     */
+    public static OpenTelemetry getOpenTelemetry() {
+        if (!enabled || openTelemetry == null) {
+            return OpenTelemetry.noop();
+        }
+        return openTelemetry;
+    }
+}
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetryConfig.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetryConfig.java
new file mode 100644
index 0000000000..2cc2af0a2b
--- /dev/null
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetryConfig.java
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server.core;
+
+import java.util.Map;
+
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Configuration for OpenTelemetry instrumentation in Tika Server.
+ *
+ * This class encapsulates all OpenTelemetry-related configuration settings,
+ * including enabling/disabling instrumentation, OTLP endpoint configuration,
+ * service identification, and sampling policies.
+ *
+ *
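+ * Configuration can be loaded from:
+ * <ul>
+ *   <li>Environment variables (take precedence)</li>
+ *   <li>XML configuration via TikaServerConfig</li>
+ * </ul>
+ *
+ * <p>
+ * Example XML configuration:
+ * <pre>{@code
+ * <openTelemetry>
+ *   <enabled>true</enabled>
+ *   <exporterType>otlp</exporterType>
+ *   <otlpEndpoint>http://localhost:4317</otlpEndpoint>
+ *   <serviceName>tika-server</serviceName>
+ *   <samplingProbability>1.0</samplingProbability>
+ *   <exportTimeoutMillis>30000</exportTimeoutMillis>
+ * </openTelemetry>
+ * }</pre>
+ *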
+ *
+ * @since 4.0.0
+ */
+public class TikaOpenTelemetryConfig implements Initializable {
+
+ /** Environment variable to disable the OpenTelemetry SDK */
+ public static final String OTEL_SDK_DISABLED_ENV = "OTEL_SDK_DISABLED";
+
+ /** Environment variable for the OTLP exporter endpoint URL */
+ public static final String OTEL_EXPORTER_OTLP_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_ENDPOINT";
+
+ /** Environment variable for the service name identifier */
+ public static final String OTEL_SERVICE_NAME_ENV = "OTEL_SERVICE_NAME";
+
+ /** Environment variable for the trace sampling strategy */
+ public static final String OTEL_TRACES_SAMPLER_ENV = "OTEL_TRACES_SAMPLER";
+
+ /** Environment variable for the trace sampling probability */
+ public static final String OTEL_TRACES_SAMPLER_ARG_ENV = "OTEL_TRACES_SAMPLER_ARG";
+
+ private static final String DEFAULT_SERVICE_NAME = "tika-server";
+ private static final String DEFAULT_OTLP_ENDPOINT = "http://localhost:4317";
+ private static final double DEFAULT_SAMPLING_PROBABILITY = 1.0;
+
+ private boolean enabled = false;
+ private String exporterType = "otlp";
+ private String otlpEndpoint = DEFAULT_OTLP_ENDPOINT;
+ private String serviceName = DEFAULT_SERVICE_NAME;
+ private double samplingProbability = DEFAULT_SAMPLING_PROBABILITY;
+ private int exportTimeoutMillis = 30000;
+
+ /**
+ * Creates a new OpenTelemetry configuration instance.
+ * Automatically loads configuration from environment variables.
+ */
+ public TikaOpenTelemetryConfig() {
+ loadFromEnvironment();
+ }
+
+ /**
+ * Loads configuration from standard OpenTelemetry environment variables.
+ * This method is called automatically during construction.
+ *
+ * Recognized environment variables:
+ *
+ * - {@code OTEL_SDK_DISABLED}: Set to "true" to disable OpenTelemetry
+ * - {@code OTEL_EXPORTER_OTLP_ENDPOINT}: OTLP endpoint URL (enables OpenTelemetry if set)
+ * - {@code OTEL_SERVICE_NAME}: Service name for identifying this Tika instance
+ * - {@code OTEL_TRACES_SAMPLER_ARG}: Sampling probability (0.0-1.0)
+ *
+ *
+ */
+ private void loadFromEnvironment() {
+ String sdkDisabled = System.getenv(OTEL_SDK_DISABLED_ENV);
+ if ("true".equalsIgnoreCase(sdkDisabled)) {
+ this.enabled = false;
+ }
+
+ String endpoint = System.getenv(OTEL_EXPORTER_OTLP_ENDPOINT_ENV);
+ if (!StringUtils.isBlank(endpoint)) {
+ this.otlpEndpoint = endpoint;
+ if (sdkDisabled == null) {
+ this.enabled = true;
+ }
+ }
+
+ String serviceName = System.getenv(OTEL_SERVICE_NAME_ENV);
+ if (!StringUtils.isBlank(serviceName)) {
+ this.serviceName = serviceName;
+ }
+
+ String samplerArg = System.getenv(OTEL_TRACES_SAMPLER_ARG_ENV);
+ if (!StringUtils.isBlank(samplerArg)) {
+ try {
+ this.samplingProbability = Double.parseDouble(samplerArg);
+ } catch (NumberFormatException e) {
+ // Keep default
+ }
+ }
+ }
+
+ /**
+ * Returns whether OpenTelemetry instrumentation is enabled.
+ *
+ * @return true if OpenTelemetry is enabled, false otherwise
+ */
+ public boolean isEnabled() {
+ return enabled;
+ }
+
+ /**
+ * Sets whether OpenTelemetry instrumentation is enabled.
+ *
+ * @param enabled true to enable OpenTelemetry, false to disable
+ */
+ public void setEnabled(boolean enabled) {
+ this.enabled = enabled;
+ }
+
+ /**
+ * Returns the type of exporter to use for telemetry data.
+ *
+ * @return the exporter type (currently only "otlp" is supported)
+ */
+ public String getExporterType() {
+ return exporterType;
+ }
+
+ /**
+ * Sets the type of exporter to use for telemetry data.
+ *
+ * @param exporterType the exporter type (currently only "otlp" is supported)
+ */
+ public void setExporterType(String exporterType) {
+ this.exporterType = exporterType;
+ }
+
+ /**
+ * Returns the OTLP endpoint URL for exporting telemetry data.
+ *
+ * @return the OTLP endpoint URL (e.g., "http://localhost:4317")
+ */
+ public String getOtlpEndpoint() {
+ return otlpEndpoint;
+ }
+
+ /**
+ * Sets the OTLP endpoint URL for exporting telemetry data.
+ *
+ * Port reference:
+ *
+ * - 4317: gRPC protocol (default for manual instrumentation)
+ * - 4318: HTTP protocol (default for Java agent)
+ *
+ *
+ *
+ * @param otlpEndpoint the OTLP endpoint URL
+ */
+ public void setOtlpEndpoint(String otlpEndpoint) {
+ this.otlpEndpoint = otlpEndpoint;
+ }
+
+ /**
+ * Returns the service name used to identify this Tika Server instance.
+ *
+ * @return the service name
+ */
+ public String getServiceName() {
+ return serviceName;
+ }
+
+ /**
+ * Sets the service name used to identify this Tika Server instance.
+ * This name appears in traces and helps identify different Tika deployments.
+ *
+ * @param serviceName the service name
+ */
+ public void setServiceName(String serviceName) {
+ this.serviceName = serviceName;
+ }
+
+ /**
+ * Returns the trace sampling probability.
+ *
+ * @return the sampling probability (0.0 = sample nothing, 1.0 = sample everything)
+ */
+ public double getSamplingProbability() {
+ return samplingProbability;
+ }
+
+ /**
+ * Sets the trace sampling probability.
+ * This determines what fraction of traces are sampled and exported.
+ *
+ * @param samplingProbability the sampling probability (must be between 0.0 and 1.0)
+ */
+ public void setSamplingProbability(double samplingProbability) {
+ this.samplingProbability = samplingProbability;
+ }
+
+ /**
+ * Returns the timeout in milliseconds for exporting telemetry data.
+ *
+ * @return the export timeout in milliseconds
+ */
+ public int getExportTimeoutMillis() {
+ return exportTimeoutMillis;
+ }
+
+ /**
+ * Sets the timeout in milliseconds for exporting telemetry data.
+ *
+ * @param exportTimeoutMillis the export timeout in milliseconds
+ */
+ public void setExportTimeoutMillis(int exportTimeoutMillis) {
+ this.exportTimeoutMillis = exportTimeoutMillis;
+ }
+
+ @Override
+ public String toString() {
+ return "TikaOpenTelemetryConfig{enabled=" + enabled +
+ ", exporterType=" + exporterType +
+ ", otlpEndpoint=" + otlpEndpoint +
+ ", serviceName=" + serviceName +
+ ", samplingProbability=" + samplingProbability +
+ ", exportTimeoutMillis=" + exportTimeoutMillis + "}";
+ }
+
+ /**
+ * Initializes the configuration after all fields have been set.
+ * This method is called by the Tika configuration framework after
+ * all setter methods have been invoked.
+ *
+ * For this configuration class, no additional initialization is required
+ * as all fields are directly set via setters.
+ *
+ *
+ * @param params configuration parameters (not used in this implementation)
+ * @throws TikaConfigException if initialization fails
+ */
+ @Override
+ public void initialize(Map params) throws TikaConfigException {
+ // All fields are set via setters by ConfigBase before this is called
+ // No additional initialization needed
+ }
+
+ /**
+ * Validates the configuration settings.
+ * This method is called by the Tika configuration framework to ensure
+ * all configuration values are valid and consistent.
+ *
+ * @param problemHandler handler for reporting configuration problems
+ * @throws TikaConfigException if the configuration is invalid
+ */
+ @Override
+ public void checkInitialization(InitializableProblemHandler problemHandler)
+ throws TikaConfigException {
+ // Validate configuration if needed
+ if (enabled && StringUtils.isBlank(otlpEndpoint)) {
+ throw new TikaConfigException(
+ "OpenTelemetry is enabled but otlpEndpoint is not configured");
+ }
+ if (samplingProbability < 0.0 || samplingProbability > 1.0) {
+ throw new TikaConfigException(
+ "samplingProbability must be between 0.0 and 1.0, got: " + samplingProbability);
+ }
+ }
+}
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerConfig.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerConfig.java
index 663e95bebf..1b8217d705 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerConfig.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerConfig.java
@@ -132,6 +132,8 @@ public class TikaServerConfig extends ConfigBase {
private TlsConfig tlsConfig = new TlsConfig();
+ private TikaOpenTelemetryConfig openTelemetry = new TikaOpenTelemetryConfig();
+
/**
* Config with only the defaults
*/
@@ -568,6 +570,14 @@ public void setTlsConfig(TlsConfig tlsConfig) {
this.tlsConfig = tlsConfig;
}
+ /**
+ * Returns the OpenTelemetry settings held by this server configuration.
+ *
+ * @return the current OpenTelemetry configuration object
+ */
+ public TikaOpenTelemetryConfig getOpenTelemetry() {
+ return openTelemetry;
+ }
+
+ /**
+ * Replaces the OpenTelemetry settings held by this server configuration.
+ * The value is stored as given, without a null check.
+ *
+ * @param openTelemetry the OpenTelemetry configuration object to use
+ */
+ public void setOpenTelemetry(TikaOpenTelemetryConfig openTelemetry) {
+ this.openTelemetry = openTelemetry;
+ }
+
public List getEndpoints() {
return endpoints;
}
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
index e1afe24918..15fd6a0d56 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
@@ -188,6 +188,14 @@ private static ServerDetails initServer(TikaServerConfig tikaServerConfig) throw
tika = TikaConfig.getDefaultConfig();
}
+ // Initialize OpenTelemetry if enabled
+ TikaOpenTelemetry.initialize(tikaServerConfig.getOpenTelemetry());
+ // Register shutdown hook to gracefully shutdown OpenTelemetry
+ Runtime.getRuntime().addShutdownHook(new Thread(() -> {
+ LOG.info("Shutting down OpenTelemetry...");
+ TikaOpenTelemetry.shutdown();
+ }));
+
DigestingParser.Digester digester = null;
if (!StringUtils.isBlank(tikaServerConfig.getDigest())) {
try {
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java
index 51e7ab3d63..ccb786cd1c 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java
@@ -36,6 +36,7 @@
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.server.core.ServerStatus;
+import org.apache.tika.server.core.TikaOpenTelemetry;
@Path("/detect")
public class DetectorResource {
@@ -61,23 +62,55 @@ public String detect(final InputStream is, @Context HttpHeaders httpHeaders, @Co
long timeoutMillis = TikaResource.getTaskTimeout(parseContext);
long taskId = serverStatus.start(ServerStatus.TASK.DETECT, filename, timeoutMillis);
+ // Start OpenTelemetry span for detect operation
+ io.opentelemetry.api.trace.Span span = null;
+ if (TikaOpenTelemetry.isEnabled()) {
+ span = TikaOpenTelemetry.getTracer()
+ .spanBuilder("tika.detect")
+ .setAttribute("tika.resource_name", filename != null ? filename : "unknown")
+ .setAttribute("tika.endpoint", "/detect")
+ .startSpan();
+ }
+
try (TikaInputStream tis = TikaInputStream.get(TikaResource.getInputStream(is, met, httpHeaders, info))) {
- return TikaResource
+ String detectedType = TikaResource
.getConfig()
.getDetector()
.detect(tis, met)
.toString();
+
+ if (span != null) {
+ span.setAttribute("tika.detected_type", detectedType);
+ span.setStatus(io.opentelemetry.api.trace.StatusCode.OK);
+ }
+
+ return detectedType;
} catch (IOException e) {
LOG.warn("Unable to detect MIME type for file. Reason: {} ({})", e.getMessage(), filename, e);
+ if (span != null) {
+ span.recordException(e);
+ span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "IO error during detection");
+ }
return MediaType.OCTET_STREAM.toString();
} catch (OutOfMemoryError e) {
LOG.error("OOM while detecting: ({})", filename, e);
serverStatus.setStatus(ServerStatus.STATUS.ERROR);
+ if (span != null) {
+ span.recordException(e);
+ span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "Out of memory");
+ }
throw e;
} catch (Throwable e) {
LOG.error("Exception while detecting: ({})", filename, e);
+ if (span != null) {
+ span.recordException(e);
+ span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "Detection error");
+ }
throw e;
} finally {
+ if (span != null) {
+ span.end();
+ }
serverStatus.complete(taskId);
}
}
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
index 29e452d945..9a847b8c60 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
@@ -41,8 +41,10 @@
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.language.detect.LanguageHandler;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.server.core.TikaOpenTelemetry;
@Path("/meta")
@@ -137,12 +139,44 @@ protected Metadata parseMetadata(InputStream is, Metadata metadata, MultivaluedM
//no need to parse embedded docs
context.set(DocumentSelector.class, metadata1 -> false);
- TikaResource.logRequest(LOG, "/meta", metadata);
- TikaResource.parse(parser, LOG, info.getPath(), is, new LanguageHandler() {
- public void endDocument() {
- metadata.set("language", getLanguage().getLanguage());
+ String fileName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+
+ // Start OpenTelemetry span for metadata extraction operation
+ io.opentelemetry.api.trace.Span span = null;
+ if (TikaOpenTelemetry.isEnabled()) {
+ span = TikaOpenTelemetry.getTracer()
+ .spanBuilder("tika.metadata.extract")
+ .setAttribute("tika.resource_name", fileName != null ? fileName : "unknown")
+ .setAttribute("tika.endpoint", "/meta")
+ .startSpan();
+ }
+
+ try {
+ TikaResource.logRequest(LOG, "/meta", metadata);
+ TikaResource.parse(parser, LOG, info.getPath(), is, new LanguageHandler() {
+ public void endDocument() {
+ metadata.set("language", getLanguage().getLanguage());
+ }
+ }, metadata, context);
+
+ // Add metadata count to span
+ if (span != null) {
+ span.setAttribute("tika.metadata_count", metadata.names().length);
+ span.setStatus(io.opentelemetry.api.trace.StatusCode.OK);
}
- }, metadata, context);
- return metadata;
+
+ return metadata;
+ } catch (IOException e) {
+ if (span != null) {
+ span.recordException(e);
+ span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR,
+ "Metadata extraction error");
+ }
+ throw e;
+ } finally {
+ if (span != null) {
+ span.end();
+ }
+ }
}
}
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 7fc042a7d5..0871bba457 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -84,6 +84,7 @@
import org.apache.tika.server.core.InputStreamFactory;
import org.apache.tika.server.core.ParseContextConfig;
import org.apache.tika.server.core.ServerStatus;
+import org.apache.tika.server.core.TikaOpenTelemetry;
import org.apache.tika.server.core.TikaServerConfig;
import org.apache.tika.server.core.TikaServerParseException;
import org.apache.tika.utils.ExceptionUtils;
@@ -361,23 +362,64 @@ public static void parse(Parser parser, Logger logger, String path, InputStream
long timeoutMillis = getTaskTimeout(parseContext);
long taskId = SERVER_STATUS.start(ServerStatus.TASK.PARSE, fileName, timeoutMillis);
+
+ // Start OpenTelemetry span for parse operation
+ io.opentelemetry.api.trace.Span span = null;
+ if (TikaOpenTelemetry.isEnabled()) {
+ span = TikaOpenTelemetry.getTracer()
+ .spanBuilder("tika.parse")
+ .setAttribute("tika.resource_name", fileName != null ? fileName : "unknown")
+ .setAttribute("tika.endpoint", path != null ? path : "/tika")
+ .startSpan();
+
+ // Add detected content type if available
+ String contentType = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
+ if (contentType != null) {
+ span.setAttribute("tika.content_type", contentType);
+ }
+ }
+
try {
parser.parse(inputStream, handler, metadata, parseContext);
+
+ // Mark span as successful and add output metadata
+ if (span != null) {
+ span.setStatus(io.opentelemetry.api.trace.StatusCode.OK);
+ }
} catch (SAXException e) {
+ if (span != null) {
+ span.recordException(e);
+ span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "SAX parsing error");
+ }
throw new TikaServerParseException(e);
} catch (EncryptedDocumentException e) {
logger.warn("{}: Encrypted document ({})", path, fileName, e);
+ if (span != null) {
+ span.recordException(e);
+ span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "Encrypted document");
+ }
throw new TikaServerParseException(e);
} catch (Exception e) {
if (!WriteLimitReachedException.isWriteLimitReached(e)) {
logger.warn("{}: Text extraction failed ({})", path, fileName, e);
}
+ if (span != null) {
+ span.recordException(e);
+ span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "Parse error");
+ }
throw new TikaServerParseException(e);
} catch (OutOfMemoryError e) {
logger.warn("{}: OOM ({})", path, fileName, e);
SERVER_STATUS.setStatus(ServerStatus.STATUS.ERROR);
+ if (span != null) {
+ span.recordException(e);
+ span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "Out of memory");
+ }
throw e;
} finally {
+ if (span != null) {
+ span.end();
+ }
SERVER_STATUS.complete(taskId);
inputStream.close();
}
diff --git a/tika-server/tika-server-core/src/main/resources/tika-server-config-default.xml b/tika-server/tika-server-core/src/main/resources/tika-server-config-default.xml
index 007a1be5a2..ab03aed02e 100644
--- a/tika-server/tika-server-core/src/main/resources/tika-server-config-default.xml
+++ b/tika-server/tika-server-core/src/main/resources/tika-server-config-default.xml
@@ -89,5 +89,33 @@
Not allowed if nofork=true. -->
java
+
+
+
+
+ false
+
+
+ otlp
+
+
+ http://localhost:4317
+
+
+ tika-server
+
+
+ 1.0
+
+
+ 30000
+
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaOpenTelemetryTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaOpenTelemetryTest.java
new file mode 100644
index 0000000000..53ed6831fc
--- /dev/null
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaOpenTelemetryTest.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server.core;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import io.opentelemetry.api.trace.Span;
+import io.opentelemetry.api.trace.Tracer;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for OpenTelemetry integration in Tika Server.
+ *
+ * These tests verify the basic functionality of the OpenTelemetry configuration
+ * and initialization, including configuration defaults, environment variable loading,
+ * SDK lifecycle management, and span creation.
+ *
+ *
+ * @since 4.0.0
+ */
+public class TikaOpenTelemetryTest {
+
+    /**
+     * Skips the calling test when any of the named environment variables is set.
+     * The configuration constructor reads these variables, so assertions about
+     * built-in defaults only make sense when they are absent.
+     *
+     * @param names environment variable names that must be unset or blank
+     */
+    private static void assumeEnvUnset(String... names) {
+        for (String name : names) {
+            org.junit.jupiter.api.Assumptions.assumeTrue(
+                    org.apache.tika.utils.StringUtils.isBlank(System.getenv(name)),
+                    name + " is set in the environment; default-value assertions do not apply");
+        }
+    }
+
+    /**
+     * Cleans up OpenTelemetry resources after each test.
+     * Ensures tests don't interfere with each other by shutting down
+     * the SDK between tests.
+     */
+    @AfterEach
+    public void cleanup() {
+        TikaOpenTelemetry.shutdown();
+    }
+
+    /**
+     * Verifies that OpenTelemetry is disabled by default when no
+     * environment variables or explicit configuration is provided.
+     */
+    @Test
+    public void testDisabledByDefault() {
+        // An OTLP endpoint in the environment enables telemetry, so the default
+        // can only be observed when that variable is absent.
+        assumeEnvUnset(TikaOpenTelemetryConfig.OTEL_EXPORTER_OTLP_ENDPOINT_ENV);
+
+        TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig();
+        assertFalse(config.isEnabled(), "OpenTelemetry should be disabled by default");
+    }
+
+    /**
+     * Verifies that configuration defaults are correctly set to
+     * standard OpenTelemetry values.
+     */
+    @Test
+    public void testConfigurationDefaults() {
+        assumeEnvUnset(TikaOpenTelemetryConfig.OTEL_EXPORTER_OTLP_ENDPOINT_ENV,
+                TikaOpenTelemetryConfig.OTEL_SERVICE_NAME_ENV,
+                TikaOpenTelemetryConfig.OTEL_TRACES_SAMPLER_ARG_ENV);
+
+        TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig();
+
+        assertEquals("tika-server", config.getServiceName());
+        assertEquals("http://localhost:4317", config.getOtlpEndpoint());
+        assertEquals(1.0, config.getSamplingProbability());
+        assertEquals(30000, config.getExportTimeoutMillis());
+        assertEquals("otlp", config.getExporterType());
+    }
+
+    /**
+     * Verifies that a freshly constructed configuration reflects the environment.
+     * Environment variables cannot be set from within the JVM, so each variable is
+     * only checked when the surrounding environment happens to define it.
+     */
+    @Test
+    public void testEnvironmentVariableLoading() {
+        TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig();
+        assertNotNull(config);
+
+        String endpoint = System.getenv(TikaOpenTelemetryConfig.OTEL_EXPORTER_OTLP_ENDPOINT_ENV);
+        if (!org.apache.tika.utils.StringUtils.isBlank(endpoint)) {
+            assertEquals(endpoint, config.getOtlpEndpoint());
+        }
+
+        String serviceName = System.getenv(TikaOpenTelemetryConfig.OTEL_SERVICE_NAME_ENV);
+        if (!org.apache.tika.utils.StringUtils.isBlank(serviceName)) {
+            assertEquals(serviceName, config.getServiceName());
+        }
+    }
+
+    /**
+     * Tests full OpenTelemetry SDK initialization with valid configuration.
+     * Verifies that the SDK is enabled and a valid Tracer instance is returned.
+     */
+    @Test
+    public void testOpenTelemetryInitialization() {
+        TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig();
+        config.setEnabled(true);
+        config.setOtlpEndpoint("http://localhost:4317");
+
+        TikaOpenTelemetry.initialize(config);
+
+        assertTrue(TikaOpenTelemetry.isEnabled(), "OpenTelemetry should be enabled after initialization");
+
+        Tracer tracer = TikaOpenTelemetry.getTracer();
+        assertNotNull(tracer, "Tracer should not be null");
+    }
+
+    /**
+     * Verifies that when OpenTelemetry is explicitly disabled in configuration,
+     * the SDK remains disabled after initialization.
+     */
+    @Test
+    public void testOpenTelemetryDisabled() {
+        TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig();
+        config.setEnabled(false);
+
+        TikaOpenTelemetry.initialize(config);
+
+        assertFalse(TikaOpenTelemetry.isEnabled(), "OpenTelemetry should remain disabled");
+    }
+
+    /**
+     * Tests the creation of custom spans using the OpenTelemetry Tracer.
+     * Verifies that spans can be created with attributes and properly ended.
+     */
+    @Test
+    public void testSpanCreation() {
+        TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig();
+        config.setEnabled(true);
+
+        TikaOpenTelemetry.initialize(config);
+
+        // Report the test as skipped, rather than silently passing, when telemetry
+        // did not come up and there is nothing to exercise.
+        org.junit.jupiter.api.Assumptions.assumeTrue(TikaOpenTelemetry.isEnabled(),
+                "OpenTelemetry is not enabled; span creation cannot be exercised");
+
+        Tracer tracer = TikaOpenTelemetry.getTracer();
+        Span span = tracer.spanBuilder("test.operation")
+                .setAttribute("test.attribute", "test.value")
+                .startSpan();
+
+        assertNotNull(span, "Span should be created");
+
+        span.end();
+    }
+
+    /**
+     * Tests all getter and setter methods on TikaOpenTelemetryConfig
+     * to ensure proper field access and mutation.
+     */
+    @Test
+    public void testGettersAndSetters() {
+        TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig();
+
+        config.setEnabled(true);
+        assertTrue(config.isEnabled());
+
+        config.setServiceName("test-service");
+        assertEquals("test-service", config.getServiceName());
+
+        config.setOtlpEndpoint("http://test:4317");
+        assertEquals("http://test:4317", config.getOtlpEndpoint());
+
+        config.setSamplingProbability(0.5);
+        assertEquals(0.5, config.getSamplingProbability());
+
+        config.setExportTimeoutMillis(60000);
+        assertEquals(60000, config.getExportTimeoutMillis());
+
+        config.setExporterType("console");
+        assertEquals("console", config.getExporterType());
+    }
+
+    /**
+     * Tests the toString() method of TikaOpenTelemetryConfig.
+     * Verifies that the string representation includes key configuration values.
+     */
+    @Test
+    public void testToString() {
+        TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig();
+        config.setEnabled(true);
+        config.setServiceName("my-tika");
+
+        String str = config.toString();
+        assertNotNull(str);
+        assertTrue(str.contains("enabled=true"));
+        assertTrue(str.contains("my-tika"));
+    }
+}
diff --git a/tika-server/tika-server-standard/bin/tika b/tika-server/tika-server-standard/bin/tika
index 90f4b39729..61d989c735 100755
--- a/tika-server/tika-server-standard/bin/tika
+++ b/tika-server/tika-server-standard/bin/tika
@@ -487,7 +487,22 @@ function start_tika() {
exit 1
fi
- TIKA_START_OPTS=('-server' "${JAVA_MEM_OPTS[@]}" "${GC_TUNE[@]}" "${GC_LOG_OPTS[@]}" \
+ # OpenTelemetry Java Agent support
+ # If OTEL_JAVAAGENT_PATH is set, add the -javaagent flag for auto-instrumentation
+ OTEL_AGENT_OPTS=()
+ if [ -n "$OTEL_JAVAAGENT_PATH" ]; then
+ if [ -f "$OTEL_JAVAAGENT_PATH" ]; then
+ OTEL_AGENT_OPTS+=("-javaagent:$OTEL_JAVAAGENT_PATH")
+ if $verbose ; then
+ echo -e " OTEL Java Agent = $OTEL_JAVAAGENT_PATH"
+ fi
+ else
+ echo -e "\nWARNING: OTEL_JAVAAGENT_PATH is set but file not found: $OTEL_JAVAAGENT_PATH"
+ echo -e " Continuing without OpenTelemetry auto-instrumentation.\n"
+ fi
+ fi
+
+ TIKA_START_OPTS=('-server' "${OTEL_AGENT_OPTS[@]}" "${JAVA_MEM_OPTS[@]}" "${GC_TUNE[@]}" "${GC_LOG_OPTS[@]}" \
"${TIKA_HOST_ARG[@]}" \
"${LOG4J_CONFIG[@]}" "${TIKA_OPTS[@]}")
diff --git a/tika-server/tika-server-standard/docker/README.md b/tika-server/tika-server-standard/docker/README.md
new file mode 100644
index 0000000000..e773281e3a
--- /dev/null
+++ b/tika-server/tika-server-standard/docker/README.md
@@ -0,0 +1,146 @@
+# OpenTelemetry Observability Stack for Tika Server
+
+This directory contains Docker Compose configurations for running an observability stack alongside Tika Server.
+
+## Quick Start
+
+### 1. Start Jaeger (Traces)
+
+```bash
+docker-compose -f docker-compose-otel.yml up -d jaeger
+```
+
+This starts Jaeger all-in-one which includes:
+- OTLP gRPC receiver on port 4317
+- OTLP HTTP receiver on port 4318
+- Jaeger UI on port 16686
+
+### 2. Configure Tika Server
+
+Set the OTLP endpoint environment variable:
+
+```bash
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+export OTEL_SERVICE_NAME=my-tika-server
+```
+
+Or enable via XML configuration in `tika-server-config.xml`:
+
+```xml
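+<openTelemetry>
+  <enabled>true</enabled>
+  <otlpEndpoint>http://localhost:4317</otlpEndpoint>
+  <serviceName>my-tika-server</serviceName>
+</openTelemetry>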
+```
+
+### 3. Start Tika Server
+
+```bash
+java -jar tika-server-standard/target/tika-server-standard-*.jar
+```
+
+### 4. Send Test Requests
+
+```bash
+# Parse a document
+curl -T sample.pdf http://localhost:9998/tika
+
+# Detect MIME type
+echo "Hello World" | curl -X PUT --data-binary @- http://localhost:9998/detect/stream
+
+# Extract metadata
+curl -T sample.pdf http://localhost:9998/meta
+```
+
+### 5. View Traces in Jaeger UI
+
+Open your browser to: http://localhost:16686
+
+- Select "my-tika-server" from the Service dropdown
+- Click "Find Traces"
+- Click on a trace to see detailed span information
+
+## Services
+
+### Jaeger
+
+Jaeger provides distributed tracing:
+- **UI**: http://localhost:16686
+- **OTLP gRPC**: localhost:4317
+- **OTLP HTTP**: localhost:4318
+
+### Prometheus (Optional)
+
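+To start Prometheus for metrics collection (first create `prometheus.yml` in this directory, as shown under "Custom Prometheus Configuration" below, because the compose file bind-mounts that file into the container):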
+
+```bash
+docker-compose -f docker-compose-otel.yml --profile with-metrics up -d
+```
+
+Access Prometheus UI at: http://localhost:9090
+
+## Stopping Services
+
+```bash
+docker-compose -f docker-compose-otel.yml down
+```
+
+To remove volumes as well:
+
+```bash
+docker-compose -f docker-compose-otel.yml down -v
+```
+
+## Troubleshooting
+
+### Traces not appearing in Jaeger
+
+1. Check Tika Server logs for OpenTelemetry initialization messages
+2. Verify OTEL_EXPORTER_OTLP_ENDPOINT is set correctly
+3. Check Jaeger logs: `docker logs tika-jaeger`
+4. Ensure firewall allows connection to port 4317
+
+### Connection refused errors
+
+Make sure Jaeger is running:
+```bash
+docker ps | grep jaeger
+```
+
+If Tika Server itself runs inside a Docker container on Mac/Windows while Jaeger's ports are published on the host, use `host.docker.internal` instead of `localhost`:
+```bash
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4317
+```
+
+## Advanced Configuration
+
+### Custom Prometheus Configuration
+
+Create a `prometheus.yml` file in this directory:
+
+```yaml
+global:
+ scrape_interval: 15s
+
+scrape_configs:
+ - job_name: 'tika-server'
+ static_configs:
+ - targets: ['host.docker.internal:9998']
+```
+
+Then start with the metrics profile.
+
+### Using with Tika Docker Container
+
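+If running Tika Server in Docker, add it to the same network. Compose prefixes the network name with the project name, which defaults to the directory name, so either start the stack with `docker-compose -p tika-otel -f docker-compose-otel.yml up -d` to get the `tika-otel_tika-otel` network used below, or look up the actual name with `docker network ls`: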
+
+```bash
+docker run -d \
+ --name tika-server \
+ --network tika-otel_tika-otel \
+ -e OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4317 \
+ -e OTEL_SERVICE_NAME=tika-server \
+ -p 9998:9998 \
+ apache/tika:latest
+```
diff --git a/tika-server/tika-server-standard/docker/docker-compose-otel.yml b/tika-server/tika-server-standard/docker/docker-compose-otel.yml
new file mode 100644
index 0000000000..ff02b77a08
--- /dev/null
+++ b/tika-server/tika-server-standard/docker/docker-compose-otel.yml
@@ -0,0 +1,47 @@
+# Docker Compose configuration for the OpenTelemetry observability stack.
+# Starts Jaeger for trace collection/visualization; Prometheus is opt-in via a profile.
+version: '3.8'
+
+services:
+  # Jaeger all-in-one: collector, query service, and UI
+  jaeger:
+    image: jaegertracing/all-in-one:latest
+    container_name: tika-jaeger
+    ports:
+      - "16686:16686"  # Jaeger UI
+      - "4317:4317"    # OTLP gRPC receiver
+      - "4318:4318"    # OTLP HTTP receiver
+      - "14250:14250"  # Jaeger gRPC
+    environment:
+      - COLLECTOR_OTLP_ENABLED=true
+      - LOG_LEVEL=info
+    networks:
+      - tika-otel
+    restart: unless-stopped
+
+  # Optional: Prometheus for metrics collection (only started with --profile with-metrics)
+  prometheus:
+    image: prom/prometheus:latest
+    container_name: tika-prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
+      - '--web.console.templates=/usr/share/prometheus/consoles'
+    ports:
+      - "9090:9090"
+    volumes:
+      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro  # must exist next to this file
+      - prometheus-data:/prometheus
+    networks:
+      - tika-otel
+    restart: unless-stopped
+    profiles:
+      - with-metrics
+
+networks:
+  tika-otel:
+    driver: bridge
+
+volumes:
+  prometheus-data:
diff --git a/tika-server/tika-server-standard/otel-agent/README.md b/tika-server/tika-server-standard/otel-agent/README.md
new file mode 100644
index 0000000000..e270d55099
--- /dev/null
+++ b/tika-server/tika-server-standard/otel-agent/README.md
@@ -0,0 +1,216 @@
+# OpenTelemetry Java Agent for Tika Server
+
+This directory contains instructions for using the OpenTelemetry Java Agent for automatic instrumentation of Tika Server.
+
+## What is Auto-Instrumentation?
+
+The OpenTelemetry Java Agent provides automatic instrumentation for many popular Java libraries without requiring code changes. When attached to Tika Server, it automatically captures:
+
+- HTTP request/response traces (via Jetty instrumentation)
+- JDBC database calls (if using database features)
+- Additional framework-level spans
+
+This complements the manual instrumentation already present in Tika Server for Tika-specific operations.
+
+## Download the Agent
+
+### Option 1: Direct Download
+
+Download the latest OpenTelemetry Java Agent:
+
+```bash
+curl -L -O https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar
+```
+
+### Option 2: Maven
+
+Add to your pom.xml and use Maven to download:
+
+```xml
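+<dependency>
+  <groupId>io.opentelemetry.javaagent</groupId>
+  <artifactId>opentelemetry-javaagent</artifactId>
+  <version>2.10.0</version>
+  <scope>runtime</scope>
+</dependency>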
+```
+
+### Option 3: Specific Version
+
+```bash
+VERSION=2.10.0
+curl -L -O "https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v${VERSION}/opentelemetry-javaagent.jar"
+```
+
+## Usage
+
+### Basic Usage
+
+```bash
+java -javaagent:opentelemetry-javaagent.jar \
+ -Dotel.service.name=tika-server \
+ -Dotel.exporter.otlp.endpoint=http://localhost:4317 -Dotel.exporter.otlp.protocol=grpc \
+ -jar tika-server-standard-*.jar
+```
+
+### With Environment Variables
+
+```bash
+export OTEL_SERVICE_NAME=tika-server
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 OTEL_EXPORTER_OTLP_PROTOCOL=grpc
+export OTEL_TRACES_EXPORTER=otlp
+export OTEL_METRICS_EXPORTER=otlp
+export OTEL_LOGS_EXPORTER=otlp
+
+java -javaagent:opentelemetry-javaagent.jar \
+ -jar tika-server-standard-*.jar
+```
+
+### Using the Tika Startup Script
+
+If you place the agent JAR in this directory, you can use an environment variable:
+
+```bash
+export OTEL_JAVAAGENT_PATH=/path/to/otel-agent/opentelemetry-javaagent.jar
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+
+./bin/tika
+```
+
+## Configuration Options
+
+### Common System Properties
+
+| Property | Description | Default |
+|----------|-------------|---------|
+| `otel.service.name` | Service name for traces | `unknown_service:java` |
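+| `otel.exporter.otlp.endpoint` | OTLP endpoint URL | `http://localhost:4318` (agent 2.x default protocol `http/protobuf`); `http://localhost:4317` when `otel.exporter.otlp.protocol=grpc` |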
+| `otel.traces.sampler` | Sampling strategy | `parentbased_always_on` |
+| `otel.traces.sampler.arg` | Sampler argument (e.g., probability) | - |
+| `otel.instrumentation.http.capture-headers.server.request` | Capture request headers | - |
+| `otel.instrumentation.http.capture-headers.server.response` | Capture response headers | - |
+
+### Sampling Configuration
+
+Sample 10% of traces:
+```bash
+-Dotel.traces.sampler=traceidratio \
+-Dotel.traces.sampler.arg=0.1
+```
+
+### Disable Specific Instrumentations
+
+```bash
+# Disable JDBC instrumentation
+-Dotel.instrumentation.jdbc.enabled=false
+
+# Disable HTTP client instrumentation
+-Dotel.instrumentation.http-url-connection.enabled=false
+```
+
+## What Gets Instrumented Automatically?
+
+With the Java Agent attached, you'll see additional spans for:
+
+### HTTP Server (Jetty/CXF)
+- `HTTP GET /tika`
+- `HTTP PUT /detect/stream`
+- Request/response headers
+- HTTP status codes
+- Request duration
+
+### JDBC (if used)
+- Database queries
+- Connection pool metrics
+- Transaction boundaries
+
+### JVM Metrics
+- Memory usage
+- GC activity
+- Thread counts
+- CPU usage
+
+## Combining Auto and Manual Instrumentation
+
+The Java Agent works seamlessly with Tika's manual instrumentation:
+
+1. **Auto-instrumentation** creates high-level HTTP request spans
+2. **Manual instrumentation** creates detailed Tika-specific spans (parse, detect, metadata)
+3. Spans are automatically nested, showing the complete request flow
+
+Example trace structure:
+```
+HTTP PUT /tika (auto-instrumented)
+ └─ tika.parse (manual)
+ ├─ parser.initialization
+ └─ content.extraction
+```
+
+## Verifying Installation
+
+After starting Tika Server with the agent:
+
+1. Check logs for OpenTelemetry initialization:
+ ```
+ [otel.javaagent] OpenTelemetry Java Agent ...
+ ```
+
+2. Send a test request:
+ ```bash
+ curl -T test.pdf http://localhost:9998/tika
+ ```
+
+3. View traces in Jaeger UI (http://localhost:16686)
+
+4. Look for both auto-instrumented (`HTTP PUT`) and manual (`tika.parse`) spans
+
+## Performance Considerations
+
+The Java Agent adds minimal overhead:
+- Typical overhead: 1-3%
+- Can be reduced with sampling
+- Async span export prevents blocking
+
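+To measure impact, start the server each way and time an identical request against it (e.g. `time curl -s -T test.pdf http://localhost:9998/tika > /dev/null`):
+```bash
+# Without agent
+java -jar tika-server-standard-*.jar
+
+# With agent
+java -javaagent:opentelemetry-javaagent.jar -jar tika-server-standard-*.jar
+```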
+
+## Troubleshooting
+
+### Agent not loading
+
+Ensure the JAR path is correct:
+```bash
+java -javaagent:/full/path/to/opentelemetry-javaagent.jar -jar ...
+```
+
+### No auto-instrumented spans
+
+Enable agent debug logging to confirm which instrumentations are being applied to the libraries in use:
+```bash
+-Dotel.javaagent.debug=true
+```
+
+### Conflicts with manual instrumentation
+
+The agent is compatible with manual SDK usage. Spans are automatically correlated.
+
+### Excessive spans
+
+Disable unwanted instrumentations:
+```bash
+-Dotel.instrumentation.common.default-enabled=false \
+-Dotel.instrumentation.jetty.enabled=true \
+-Dotel.instrumentation.http.enabled=true
+```
+
+## Further Reading
+
+- [OpenTelemetry Java Agent Documentation](https://opentelemetry.io/docs/instrumentation/java/automatic/)
+- [Supported Libraries and Frameworks](https://github.com/open-telemetry/opentelemetry-java-instrumentation/blob/main/docs/supported-libraries.md)
+- [Configuration Options](https://opentelemetry.io/docs/instrumentation/java/automatic/agent-config/)