From 3c5797b942991306b05a6eae10dc262c6f6b9fda Mon Sep 17 00:00:00 2001 From: lewismc Date: Thu, 16 Oct 2025 21:51:56 -0700 Subject: [PATCH] TIKA-4513 Instrument tika-server --- .gitignore | 3 +- tika-server/OPENTELEMETRY.md | 658 ++++++++++++++++++ tika-server/tika-server-core/pom.xml | 38 + .../tika/server/core/TikaOpenTelemetry.java | 342 +++++++++ .../server/core/TikaOpenTelemetryConfig.java | 301 ++++++++ .../tika/server/core/TikaServerConfig.java | 10 + .../tika/server/core/TikaServerProcess.java | 8 + .../core/resource/DetectorResource.java | 35 +- .../core/resource/MetadataResource.java | 46 +- .../server/core/resource/TikaResource.java | 42 ++ .../resources/tika-server-config-default.xml | 28 + .../server/core/TikaOpenTelemetryTest.java | 187 +++++ tika-server/tika-server-standard/bin/tika | 17 +- .../tika-server-standard/docker/README.md | 146 ++++ .../docker/docker-compose-otel.yml | 47 ++ .../tika-server-standard/otel-agent/README.md | 216 ++++++ 16 files changed, 2115 insertions(+), 9 deletions(-) create mode 100644 tika-server/OPENTELEMETRY.md create mode 100644 tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetry.java create mode 100644 tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetryConfig.java create mode 100644 tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaOpenTelemetryTest.java create mode 100644 tika-server/tika-server-standard/docker/README.md create mode 100644 tika-server/tika-server-standard/docker/docker-compose-otel.yml create mode 100644 tika-server/tika-server-standard/otel-agent/README.md diff --git a/.gitignore b/.gitignore index 011a1f3a01..c73b59429f 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,5 @@ nb-configuration.xml *.DS_Store *.tmp-inception *.snap -.*.swp \ No newline at end of file +.*.swp +.vscode diff --git a/tika-server/OPENTELEMETRY.md b/tika-server/OPENTELEMETRY.md new file mode 100644 index 
0000000000..789a10474c --- /dev/null +++ b/tika-server/OPENTELEMETRY.md @@ -0,0 +1,658 @@ +# OpenTelemetry Instrumentation for Apache Tika Server + +This document describes how to enable and use OpenTelemetry (OTEL) observability in Apache Tika Server for comprehensive monitoring of traces, metrics, and logs. + +## Table of Contents + +1. [Introduction](#introduction) +2. [Prerequisites](#prerequisites) +3. [Quick Start](#quick-start) +4. [Configuration](#configuration) +5. [Auto-Instrumentation Setup](#auto-instrumentation-setup) +6. [Manual Instrumentation](#manual-instrumentation) +7. [Exporters](#exporters) +8. [Docker Integration](#docker-integration) +9. [Verifying Setup](#verifying-setup) +10. [Performance Considerations](#performance-considerations) +11. [Troubleshooting](#troubleshooting) + +## Introduction + +OpenTelemetry provides standardized observability instrumentation for Tika Server, enabling: + +- **Distributed Tracing**: End-to-end request flows from HTTP ingestion to parser execution +- **Metrics**: Throughput, error rates, and resource usage +- **Structured Logs**: Correlated with traces via trace/span IDs + +### Why OpenTelemetry? + +- **Vendor-neutral**: Works with Jaeger, Zipkin, Prometheus, Grafana, and many others +- **Future-proof**: Semantic conventions ensure compatibility with evolving backends +- **Low overhead**: Configurable sampling and async export minimize performance impact +- **Rich ecosystem**: Integrates with modern observability platforms + +## Prerequisites + +- **Java**: 11 or higher +- **Apache Tika Server**: 4.0.0-SNAPSHOT or later +- **OpenTelemetry Collector or Backend**: Jaeger, Zipkin, or OTLP-compatible collector + +## Quick Start + +### 1. 
Enable OpenTelemetry via Environment Variables + +The simplest way to enable OpenTelemetry is through environment variables: + +```bash +# For manual instrumentation (uses gRPC by default) +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 +export OTEL_SERVICE_NAME=my-tika-server + +# For auto-instrumentation with Java agent (uses HTTP by default) +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 +export OTEL_SERVICE_NAME=my-tika-server +``` + +**Port Reference:** +- **4317**: OTLP gRPC endpoint (default for manual instrumentation) +- **4318**: OTLP HTTP endpoint (default for Java agent auto-instrumentation) + +Setting `OTEL_EXPORTER_OTLP_ENDPOINT` automatically enables OpenTelemetry. + +### 2. Start a Local Jaeger Instance + +```bash +cd tika-server/tika-server-standard/docker +docker-compose -f docker-compose-otel.yml up -d jaeger +``` + +Jaeger UI will be available at: http://localhost:16686 + +### 3. Start Tika Server + +```bash +java -jar tika-server-standard/target/tika-server-standard-*.jar +``` + +### 4. Send Test Requests + +**Important**: Include the `File-Name` header to properly populate the `tika.resource_name` span attribute: + +```bash +# Parse a document +curl -T mydocument.pdf \ + -H "File-Name: mydocument.pdf" \ + http://localhost:9998/tika + +# Detect MIME type +curl -T sample.txt \ + -H "File-Name: sample.txt" \ + http://localhost:9998/detect/stream + +# Extract metadata +curl -T mydocument.pdf \ + -H "File-Name: mydocument.pdf" \ + http://localhost:9998/meta + +# Alternative: Use Content-Disposition header (standard HTTP) +curl -T document.docx \ + -H "Content-Disposition: attachment; filename=document.docx" \ + http://localhost:9998/tika + +# Or use multipart form upload (filename included automatically) +curl -F "file=@mydocument.pdf" http://localhost:9998/tika/form +``` + +### 5. View Traces + +Open http://localhost:16686, select "my-tika-server" from the service dropdown, and click "Find Traces". 
+ +## Configuration + +### Environment Variables + +OpenTelemetry can be configured entirely through environment variables: + +| Variable | Description | Default | +|----------|-------------|---------| +| `OTEL_SDK_DISABLED` | Disable OpenTelemetry completely | `false` | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP endpoint URL | `http://localhost:4317` (gRPC) or `http://localhost:4318` (HTTP) | +| `OTEL_EXPORTER_OTLP_PROTOCOL` | Protocol to use | `grpc` (manual) or `http/protobuf` (agent) | +| `OTEL_SERVICE_NAME` | Service name for identification | `tika-server` | +| `OTEL_TRACES_SAMPLER` | Sampling strategy | `parentbased_always_on` | +| `OTEL_TRACES_SAMPLER_ARG` | Sampling probability (0.0-1.0) | `1.0` | + +**Example:** + +```bash +export OTEL_EXPORTER_OTLP_ENDPOINT=https://my-collector.example.com:4317 +export OTEL_SERVICE_NAME=production-tika-server +export OTEL_TRACES_SAMPLER=traceidratio +export OTEL_TRACES_SAMPLER_ARG=0.1 # Sample 10% of traces +``` + +### XML Configuration + +Alternatively, configure via `tika-server-config.xml`: + +```xml + + + + 9998 + + + + true + otlp + http://localhost:4317 + tika-server + 1.0 + 30000 + + + +``` + +**Note:** Environment variables take precedence over XML configuration. + +### Configuration Options + +- **enabled**: Enable/disable OpenTelemetry (`true`/`false`) +- **exporterType**: Currently only `otlp` is supported +- **otlpEndpoint**: OTLP gRPC endpoint URL +- **serviceName**: Identifier for this Tika Server instance +- **samplingProbability**: Fraction of traces to sample (0.0 to 1.0) +- **exportTimeoutMillis**: Timeout for exporting telemetry data + +## Auto-Instrumentation Setup + +Auto-instrumentation provides automatic tracing for HTTP requests and other framework-level operations. 
+ +### Download the OpenTelemetry Java Agent + +```bash +cd tika-server/tika-server-standard/otel-agent +curl -L -O https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar +``` + +### Run Tika Server with Auto-Instrumentation + +```bash +# Using HTTP protocol (default for Java agent) - port 4318 +java -javaagent:otel-agent/opentelemetry-javaagent.jar \ + -Dotel.service.name=tika-server \ + -Dotel.exporter.otlp.endpoint=http://localhost:4318 \ + -jar tika-server-standard/target/tika-server-standard-*.jar + +# Or using gRPC protocol - port 4317 +java -javaagent:otel-agent/opentelemetry-javaagent.jar \ + -Dotel.service.name=tika-server \ + -Dotel.exporter.otlp.endpoint=http://localhost:4317 \ + -Dotel.exporter.otlp.protocol=grpc \ + -jar tika-server-standard/target/tika-server-standard-*.jar +``` + +**Important**: The Java agent defaults to HTTP protocol (port 4318). If using gRPC (port 4317), you must specify `-Dotel.exporter.otlp.protocol=grpc`. + +### What Gets Instrumented Automatically? + +- HTTP server requests (Jetty/CXF) +- HTTP client calls (if Tika makes outbound HTTP requests) +- JDBC database operations (if using database features) +- JVM metrics (memory, GC, threads, CPU) + +See [otel-agent/README.md](tika-server-standard/otel-agent/README.md) for more details. 
+ +## Manual Instrumentation + +Tika Server includes manual instrumentation for Tika-specific operations: + +### Instrumented Endpoints + +| Endpoint | Span Name | Attributes | +|----------|-----------|------------| +| `/tika` | `tika.parse` | `tika.resource_name`, `tika.content_type`, `tika.endpoint` | +| `/detect` | `tika.detect` | `tika.resource_name`, `tika.detected_type`, `tika.endpoint` | +| `/meta` | `tika.metadata.extract` | `tika.resource_name`, `tika.metadata_count`, `tika.endpoint` | + +### Span Attributes + +- **tika.resource_name**: Filename or resource being processed + - Extracted from `File-Name` request header + - Or from `Content-Disposition: attachment; filename=...` header + - Automatically populated when using multipart form uploads + - Displays as "unknown" if no filename header is provided +- **tika.content_type**: Detected MIME type +- **tika.detected_type**: Result of MIME detection +- **tika.metadata_count**: Number of metadata fields extracted +- **tika.endpoint**: API endpoint invoked + +### Error Handling + +Exceptions are automatically recorded in spans with: +- Exception type and message +- Stack traces +- Span status set to ERROR + +## Exporters + +### OTLP (Recommended) + +OTLP (OpenTelemetry Protocol) is the native protocol and recommended exporter. 
OTLP supports two transport protocols: + +#### gRPC (Port 4317) +- **Used by**: Manual instrumentation (default in our code) +- **Protocol**: Binary, efficient +- **Configuration:** + +```bash +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 +# Protocol is gRPC by default for manual instrumentation +``` + +#### HTTP/Protobuf (Port 4318) +- **Used by**: Java agent auto-instrumentation (default) +- **Protocol**: HTTP with protobuf encoding +- **Configuration:** + +```bash +# For Java agent (HTTP is default) +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 + +# Or explicitly specify protocol +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 +export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf +``` + +**Supported Backends:** +- OpenTelemetry Collector +- Jaeger (v1.35+) +- Grafana Tempo +- Grafana Cloud +- Honeycomb +- Lightstep +- New Relic +- Many others + +### Jaeger + +Direct export to Jaeger using OTLP. Publish both OTLP ports so that manual instrumentation (gRPC, 4317) and the Java agent (HTTP, 4318) can reach it: + +```bash +docker run -d --name jaeger \ + -p 16686:16686 \ + -p 4317:4317 \ + -p 4318:4318 \ + jaegertracing/all-in-one:latest +``` + +Access Jaeger UI: http://localhost:16686 + +### Grafana Cloud + +```bash +export OTEL_EXPORTER_OTLP_ENDPOINT=https://otlp-gateway-prod-us-central-0.grafana.net/otlp +export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Basic <base64-encoded-credentials>" +``` + +Replace `<base64-encoded-credentials>` with the Base64-encoded credentials issued for your Grafana Cloud stack. + +### Console Exporter (Development) + +For development/debugging, use the console exporter (this requires adding the `io.opentelemetry:opentelemetry-exporter-logging` dependency, which is not part of the default build): + +```java +// In TikaOpenTelemetry.java, replace OtlpGrpcSpanExporter with: +LoggingSpanExporter spanExporter = LoggingSpanExporter.create(); +``` + +## Docker Integration + +### Using Docker Compose + +Start Tika Server with observability stack: + +```bash +cd tika-server/tika-server-standard/docker +docker-compose -f docker-compose-otel.yml up -d +``` + +This starts: +- Jaeger (traces + UI) +- Optionally: Prometheus (metrics) + +### Running Tika Server in Docker + +```bash +docker run -d \ + --name tika-server \ + --network tika-otel_tika-otel \ + -e OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4317 \ + -e
OTEL_SERVICE_NAME=tika-server \ + -p 9998:9998 \ + apache/tika:latest +``` + +### Kubernetes / Helm + +For production Kubernetes deployments, use the OpenTelemetry Operator: + +```yaml +apiVersion: opentelemetry.io/v1alpha1 +kind: Instrumentation +metadata: + name: tika-instrumentation +spec: + exporter: + endpoint: http://otel-collector:4317 + propagators: + - tracecontext + - baggage + sampler: + type: parentbased_traceidratio + argument: "0.25" +``` + +Apply to Tika Server deployment with annotation: +```yaml +metadata: + annotations: + instrumentation.opentelemetry.io/inject-java: "true" +``` + +See the Tika Helm chart documentation for more details. + +## Verifying Setup + +### 1. Check Tika Server Logs + +Look for initialization messages: + +``` +INFO [main] o.a.t.s.c.TikaOpenTelemetry - Initializing OpenTelemetry: TikaOpenTelemetryConfig{enabled=true, ...} +INFO [main] o.a.t.s.c.TikaOpenTelemetry - OpenTelemetry initialized successfully +``` + +### 2. Send Test Requests + +**Important**: Always include filename headers to populate the `tika.resource_name` attribute in traces. 
+ +```bash +# Create a test file +echo "Hello OpenTelemetry" > test.txt + +# Parse with Tika (include File-Name header) +curl -T test.txt \ + -H "File-Name: test.txt" \ + http://localhost:9998/tika + +# Detect MIME type +curl -T test.txt \ + -H "File-Name: test.txt" \ + http://localhost:9998/detect/stream + +# Extract metadata +curl -T test.txt \ + -H "File-Name: test.txt" \ + http://localhost:9998/meta + +# Alternative methods to include filename: + +# Method 1: Content-Disposition header (HTTP standard) +curl -T test.txt \ + -H "Content-Disposition: attachment; filename=test.txt" \ + http://localhost:9998/tika + +# Method 2: Multipart form upload (filename automatic) +curl -F "file=@test.txt" http://localhost:9998/tika/form +``` + +**Why include filename headers?** +Without the filename header, the `tika.resource_name` span attribute will show as "unknown", making it harder to identify which document was processed in traces. + +### 3. View in Jaeger + +1. Open http://localhost:16686 +2. Service: Select your service name (e.g., "tika-server") +3. Click "Find Traces" +4. Click on a trace to see detailed spans + +### Expected Span Structure + +**With Auto-Instrumentation (Java agent):** +``` +HTTP PUT /tika + └─ tika.parse + Attributes: + - tika.resource_name: test.txt ← From File-Name header + - tika.content_type: text/plain ← Auto-detected by Tika + - tika.endpoint: /tika + - span.status: OK +``` + +**Manual Instrumentation Only:** +``` +tika.parse + Attributes: + - tika.resource_name: mydocument.pdf + - tika.content_type: application/pdf + - tika.endpoint: /tika +``` + +**Note**: If the filename header is missing, `tika.resource_name` will show as "unknown". 
+ +## Performance Considerations + +### Overhead + +OpenTelemetry adds minimal overhead when properly configured: +- **Disabled**: No overhead +- **Enabled with sampling**: 1-3% typical overhead +- **Enabled without sampling**: 3-5% worst-case overhead + +### Sampling Strategies + +**Always On**: +```bash +export OTEL_TRACES_SAMPLER=always_on +``` +Captures every trace. Good for development and low-traffic services. The default sampler, `parentbased_always_on`, behaves the same way for traces that start in Tika Server, but honors an upstream caller's decision not to sample. + +**Probability-Based**: +```bash +export OTEL_TRACES_SAMPLER=traceidratio +export OTEL_TRACES_SAMPLER_ARG=0.1 # 10% sampling +``` +Samples a percentage of traces. Reduces overhead and storage costs. + +**Parent-Based** (Recommended): +```bash +export OTEL_TRACES_SAMPLER=parentbased_traceidratio +export OTEL_TRACES_SAMPLER_ARG=0.1 +``` +Respects parent trace sampling decisions (for distributed tracing). + +### Async Export + +Telemetry data is exported asynchronously in batches, preventing blocking of request processing. + +### Resource Limits + +The OpenTelemetry SDK uses bounded queues to prevent memory issues: +- Default queue size: 2048 spans +- Spans are dropped if queue is full (counted in metrics) + +## Troubleshooting + +### Traces Not Appearing + +**Problem**: No traces visible in Jaeger/backend. + +**Solutions**: + +1. **Check OpenTelemetry is enabled:** + ```bash + grep "OpenTelemetry" tika-server.log + ``` + You should see "OpenTelemetry initialized successfully". + +2. **Verify endpoint is reachable:** + ```bash + telnet localhost 4317 + ``` + +3. **Check for errors in Tika logs:** + ```bash + grep "ERROR.*OpenTelemetry" tika-server.log + ``` + +4. **Verify backend is running:** + ```bash + docker ps | grep jaeger + ``` + +### Connection Refused + +**Problem**: `Connection refused` to OTLP endpoint. + +**Solutions**: + +1. **Start Jaeger/collector:** + ```bash + docker-compose -f docker/docker-compose-otel.yml up -d jaeger + ``` + +2.
**Verify correct port for your protocol:** + - **Manual instrumentation (gRPC)**: Use port `4317` + - **Auto-instrumentation (HTTP)**: Use port `4318` + + ```bash + # Manual: + export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 + + # Agent: + export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 + ``` + +3. **Check firewall rules:** + Ensure ports 4317 (gRPC) and 4318 (HTTP) are not blocked. + +4. **Use correct hostname:** + - Local: `http://localhost:4317` or `http://localhost:4318` + - Docker: `http://jaeger:4317` or `http://jaeger:4318` + - Docker Desktop: `http://host.docker.internal:4317` or `http://host.docker.internal:4318` + +### Wrong Port/Protocol + +**Problem**: Warning in logs: "OTLP exporter endpoint port is likely incorrect for protocol version..." + +**Cause**: Port and protocol mismatch. + +**Solution**: Match the port to the protocol: + +```bash +# If using gRPC (manual instrumentation): +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 +export OTEL_EXPORTER_OTLP_PROTOCOL=grpc + +# If using HTTP (Java agent default): +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 +# HTTP is the default for agent, no need to specify protocol +``` + +**Quick Reference:** +- gRPC → Port 4317 +- HTTP → Port 4318 + +### High Overhead + +**Problem**: Tika Server performance degraded after enabling OTEL. + +**Solutions**: + +1. **Enable sampling:** + ```bash + export OTEL_TRACES_SAMPLER=traceidratio + export OTEL_TRACES_SAMPLER_ARG=0.1 + ``` + +2. **Disable auto-instrumentation:** + Remove `-javaagent` flag if only manual instrumentation is needed. + +3. **Increase export batch size:** + Reduces export frequency (in code: `BatchSpanProcessor.builder().setMaxExportBatchSize(512)`). + +### Spans Missing Attributes + +**Problem**: Spans don't show expected attributes (e.g., `tika.resource_name` shows "unknown"). 
+ +**Causes**: +- **Missing filename header**: The `File-Name` or `Content-Disposition` header was not included in the request +- **Attributes are null or not set** + +**Solutions**: + +1. **Include filename in requests:** + ```bash + # Add File-Name header + curl -T document.pdf -H "File-Name: document.pdf" http://localhost:9998/tika + + # Or use Content-Disposition + curl -T document.pdf -H "Content-Disposition: attachment; filename=document.pdf" http://localhost:9998/tika + + # Or use multipart form + curl -F "file=@document.pdf" http://localhost:9998/tika/form + ``` + +2. **Check Tika Server logs for warnings:** + ```bash + grep "WARN" tika-server.log | grep -i metadata + ``` + +### Duplicate Spans + +**Problem**: Seeing duplicate spans for the same operation. + +**Cause**: Both auto and manual instrumentation creating spans. + +**Solution**: This is expected. Auto-instrumentation creates HTTP-level spans, manual creates Tika-specific spans. They should be nested, not duplicated. + +## Further Reading + +- [OpenTelemetry Official Documentation](https://opentelemetry.io/docs/) +- [OpenTelemetry Java SDK](https://opentelemetry.io/docs/instrumentation/java/) +- [OTLP Specification](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/protocol/otlp.md) +- [Semantic Conventions](https://github.com/open-telemetry/semantic-conventions) +- [Jaeger Documentation](https://www.jaegertracing.io/docs/) +- [Tika Wiki](https://cwiki.apache.org/confluence/display/TIKA) + +## Contributing + +To add more instrumentation to Tika Server: + +1. Import OpenTelemetry API: + ```java + import io.opentelemetry.api.trace.Span; + import io.opentelemetry.api.trace.Tracer; + ``` + +2. Get tracer instance: + ```java + Tracer tracer = TikaOpenTelemetry.getTracer(); + ``` + +3. 
Create spans: + ```java + Span span = tracer.spanBuilder("operation.name") + .setAttribute("key", "value") + .startSpan(); + try { + // Your code + span.setStatus(StatusCode.OK); + } catch (Exception e) { + span.recordException(e); + span.setStatus(StatusCode.ERROR); + } finally { + span.end(); + } + ``` + +See existing instrumentation in `TikaResource.java`, `DetectorResource.java`, and `MetadataResource.java` for examples. diff --git a/tika-server/tika-server-core/pom.xml b/tika-server/tika-server-core/pom.xml index 8dc3c5a825..18af603eb2 100644 --- a/tika-server/tika-server-core/pom.xml +++ b/tika-server/tika-server-core/pom.xml @@ -119,6 +119,44 @@ <groupId>org.apache.logging.log4j</groupId> <artifactId>log4j-slf4j2-impl</artifactId> </dependency> + + <!-- OpenTelemetry dependencies --> + <dependency> + <groupId>io.opentelemetry</groupId> + <artifactId>opentelemetry-api</artifactId> + <version>1.55.0</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>io.opentelemetry</groupId> + <artifactId>opentelemetry-sdk</artifactId> + <version>1.55.0</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>io.opentelemetry</groupId> + <artifactId>opentelemetry-exporter-otlp</artifactId> + <version>1.55.0</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>io.opentelemetry.instrumentation</groupId> + <artifactId>opentelemetry-instrumentation-annotations</artifactId> + <version>2.20.1</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>io.opentelemetry.instrumentation</groupId> + <artifactId>opentelemetry-log4j-appender-2.17</artifactId> + <version>2.20.1-alpha</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>io.opentelemetry</groupId> + <artifactId>opentelemetry-sdk-testing</artifactId> + <version>1.55.0</version> + <scope>test</scope> + </dependency> diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetry.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetry.java new file mode 100644 index 0000000000..b69e70cec3 --- /dev/null +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetry.java @@ -0,0 +1,342 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.server.core; + +import java.time.Duration; +import java.util.concurrent.TimeUnit; + +import io.opentelemetry.api.GlobalOpenTelemetry; +import io.opentelemetry.api.OpenTelemetry; +import io.opentelemetry.api.common.AttributeKey; +import io.opentelemetry.api.common.Attributes; +import io.opentelemetry.api.metrics.Meter; +import io.opentelemetry.api.trace.Tracer; +import io.opentelemetry.exporter.otlp.metrics.OtlpGrpcMetricExporter; +import io.opentelemetry.exporter.otlp.trace.OtlpGrpcSpanExporter; +import io.opentelemetry.sdk.OpenTelemetrySdk; +import io.opentelemetry.sdk.metrics.SdkMeterProvider; +import io.opentelemetry.sdk.metrics.export.PeriodicMetricReader; +import io.opentelemetry.sdk.resources.Resource; +import io.opentelemetry.sdk.trace.SdkTracerProvider; +import io.opentelemetry.sdk.trace.export.BatchSpanProcessor; +import io.opentelemetry.sdk.trace.samplers.Sampler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Singleton class for managing OpenTelemetry instrumentation in Tika Server. + *

+ * This class provides centralized management of the OpenTelemetry SDK lifecycle, + * including initialization, configuration, and graceful shutdown. It exposes + * {@link Tracer} and {@link Meter} instances for creating custom spans and metrics + * throughout the Tika Server codebase. + *

+ *

+ * The class automatically detects when the OpenTelemetry Java agent is present + * and defers to the agent's configuration in that case, enabling both manual + * and auto-instrumentation to coexist harmoniously. + *

+ *

+ * Usage example: + *

{@code
+ * // Initialize during server startup
+ * TikaOpenTelemetry.initialize(config);
+ *
+ * // Create custom spans
+ * Tracer tracer = TikaOpenTelemetry.getTracer();
+ * Span span = tracer.spanBuilder("custom.operation").startSpan();
+ * try {
+ *     // Your code here
+ *     span.setStatus(StatusCode.OK);
+ * } finally {
+ *     span.end();
+ * }
+ *
+ * // Shutdown during server teardown
+ * TikaOpenTelemetry.shutdown();
+ * }
+ *

+ * + * @since 4.0.0 + */ +public class TikaOpenTelemetry { + + private static final Logger LOG = LoggerFactory.getLogger(TikaOpenTelemetry.class); + + /** Instrumentation library name for Tika Server */ + private static final String INSTRUMENTATION_NAME = "org.apache.tika.server"; + + /** Instrumentation library version */ + private static final String INSTRUMENTATION_VERSION = "1.0.0"; + + private static volatile OpenTelemetry openTelemetry = null; + private static volatile Tracer tracer = null; + private static volatile Meter meter = null; + private static volatile boolean initialized = false; + private static volatile boolean enabled = false; + private static volatile SdkTracerProvider tracerProvider = null; + private static volatile SdkMeterProvider meterProvider = null; + + /** + * Initializes OpenTelemetry with the provided configuration. + *

+ * This method is thread-safe and will only initialize once. Subsequent calls + * will log a warning and return immediately. + *

+ *

+ * The initialization process: + *

    + *
  1. Checks if OpenTelemetry Java agent is already active (auto-instrumentation)
  2. + *
  3. If agent present, uses the agent's global instance
  4. + *
  5. Otherwise, creates and registers a new OpenTelemetry SDK instance
  6. + *
  7. Configures OTLP exporters for traces and metrics
  8. + *
  9. Sets up resource attributes (service name, version)
  10. + *
  11. Configures sampling based on provided probability
  12. + *
+ *

+ *

+ * If configuration has {@code enabled=false}, OpenTelemetry remains disabled + * and noop implementations are returned by {@link #getTracer()} and {@link #getMeter()}. + *

+ * + * @param config the OpenTelemetry configuration settings + */ + public static synchronized void initialize(TikaOpenTelemetryConfig config) { + if (initialized) { + LOG.warn("OpenTelemetry already initialized, skipping"); + return; + } + + if (!config.isEnabled()) { + LOG.info("OpenTelemetry is disabled"); + initialized = true; + enabled = false; + return; + } + + try { + LOG.info("Initializing OpenTelemetry: {}", config); + + Resource resource = Resource.getDefault().merge(Resource.create( + Attributes.builder() + .put(AttributeKey.stringKey("service.name"), config.getServiceName()) + .put(AttributeKey.stringKey("service.version"), INSTRUMENTATION_VERSION) + .build())); + + // Configure tracer provider + OtlpGrpcSpanExporter spanExporter = OtlpGrpcSpanExporter.builder() + .setEndpoint(config.getOtlpEndpoint()) + .setTimeout(Duration.ofMillis(config.getExportTimeoutMillis())) + .build(); + + Sampler sampler = Sampler.traceIdRatioBased(config.getSamplingProbability()); + + tracerProvider = SdkTracerProvider.builder() + .addSpanProcessor(BatchSpanProcessor.builder(spanExporter).build()) + .setResource(resource) + .setSampler(sampler) + .build(); + + // Configure meter provider + OtlpGrpcMetricExporter metricExporter = OtlpGrpcMetricExporter.builder() + .setEndpoint(config.getOtlpEndpoint()) + .setTimeout(Duration.ofMillis(config.getExportTimeoutMillis())) + .build(); + + meterProvider = SdkMeterProvider.builder() + .registerMetricReader( + PeriodicMetricReader.builder(metricExporter) + .setInterval(Duration.ofSeconds(60)) + .build()) + .setResource(resource) + .build(); + + // Build and register OpenTelemetry SDK globally + // This may fail if the Java agent has already set the global instance + try { + openTelemetry = OpenTelemetrySdk.builder() + .setTracerProvider(tracerProvider) + .setMeterProvider(meterProvider) + .buildAndRegisterGlobal(); + + tracer = openTelemetry.getTracer(INSTRUMENTATION_NAME, INSTRUMENTATION_VERSION); + meter = 
openTelemetry.getMeter(INSTRUMENTATION_NAME); + + LOG.info("OpenTelemetry initialized successfully"); + } catch (IllegalStateException e) { + // Java agent has already set the global OpenTelemetry + LOG.info("OpenTelemetry Java agent detected. Using agent's configuration (service.name from -Dotel.service.name)"); + + // Use the agent's global instance + OpenTelemetry agentOtel = GlobalOpenTelemetry.get(); + tracer = agentOtel.getTracer(INSTRUMENTATION_NAME, INSTRUMENTATION_VERSION); + meter = agentOtel.getMeter(INSTRUMENTATION_NAME); + + // Clean up our providers since we're using the agent's + if (tracerProvider != null) { + tracerProvider.close(); + } + if (meterProvider != null) { + meterProvider.close(); + } + tracerProvider = null; + meterProvider = null; + openTelemetry = agentOtel; + } + + initialized = true; + enabled = true; + + } catch (Exception e) { + LOG.error("Failed to initialize OpenTelemetry", e); + initialized = true; + enabled = false; + } + } + + /** + * Returns the OpenTelemetry tracer instance for creating custom spans. + *

+ * The tracer can be used to instrument custom operations and add + * application-specific spans to traces. If OpenTelemetry is not enabled + * or not initialized, returns a noop tracer that performs no operations. + *

+ *

+ * Example usage: + *

{@code
+     * Tracer tracer = TikaOpenTelemetry.getTracer();
+     * Span span = tracer.spanBuilder("custom.operation")
+     *     .setAttribute("key", "value")
+     *     .startSpan();
+     * try (Scope scope = span.makeCurrent()) {
+     *     // Your instrumented code
+     *     span.setStatus(StatusCode.OK);
+     * } finally {
+     *     span.end();
+     * }
+     * }
+ *

+ * + * @return Tracer instance for creating spans, never null + */ + public static Tracer getTracer() { + if (!enabled || tracer == null) { + return OpenTelemetry.noop().getTracer(INSTRUMENTATION_NAME); + } + return tracer; + } + + /** + * Returns the OpenTelemetry meter instance for recording metrics. + *

+ * The meter can be used to create counters, gauges, and histograms for + * recording custom metrics. If OpenTelemetry is not enabled or not + * initialized, returns a noop meter that performs no operations. + *

+ *

+ * Example usage: + *

{@code
+     * Meter meter = TikaOpenTelemetry.getMeter();
+     * LongCounter counter = meter.counterBuilder("parse.count")
+     *     .setDescription("Number of documents parsed")
+     *     .build();
+     * counter.add(1, Attributes.builder()
+     *     .put("content_type", "application/pdf")
+     *     .build());
+     * }
+ *

+ * + * @return Meter instance for recording metrics, never null + */ + public static Meter getMeter() { + if (!enabled || meter == null) { + return OpenTelemetry.noop().getMeter(INSTRUMENTATION_NAME); + } + return meter; + } + + /** + * Checks if OpenTelemetry instrumentation is enabled and active. + *

+ * This method can be used to conditionally create spans or metrics only + * when OpenTelemetry is active, though it's generally not necessary as + * {@link #getTracer()} and {@link #getMeter()} return noop implementations + * when disabled. + *

+ * + * @return true if OpenTelemetry is enabled and initialized, false otherwise + */ + public static boolean isEnabled() { + return enabled; + } + + /** + * Gracefully shuts down the OpenTelemetry SDK and flushes any pending telemetry data. + *

+ * This method should be called during application shutdown to ensure all + * traces and metrics are properly exported before the process terminates. + * The shutdown process: + *

+     * <ol>
+     *   <li>Stops accepting new spans and metrics</li>
+     *   <li>Flushes all pending data to the configured exporter</li>
+     *   <li>Waits up to 10 seconds for each provider (tracer first, then meter) to finish</li>
+     *   <li>Releases all resources</li>
+     * </ol>
+     *

+ *

+ * This method is thread-safe and idempotent. If OpenTelemetry is not initialized + * or already shutdown, this method returns immediately without error. + *

+ *

+ * Note: If the OpenTelemetry Java agent is in use, this method + * will not shut down the agent's SDK, only the manually created providers (if any). + *

+     */
+    public static synchronized void shutdown() {
+        if (!initialized || !enabled) {
+            return;
+        }
+
+        LOG.info("Shutting down OpenTelemetry");
+
+        try {
+            // join() only waits; it does not throw when the shutdown times out or fails,
+            // so the outcome has to be read from the returned result code.
+            boolean clean = true;
+            if (tracerProvider != null) {
+                clean &= tracerProvider.shutdown().join(10, TimeUnit.SECONDS).isSuccess();
+            }
+            if (meterProvider != null) {
+                clean &= meterProvider.shutdown().join(10, TimeUnit.SECONDS).isSuccess();
+            }
+            if (clean) {
+                LOG.info("OpenTelemetry shut down successfully");
+            } else {
+                LOG.warn("OpenTelemetry shutdown failed or timed out; pending telemetry may not have been exported");
+            }
+        } catch (Exception e) {
+            LOG.error("Error shutting down OpenTelemetry", e);
+        } finally {
+            // NOTE(review): only 'enabled' is cleared here; 'initialized' keeps its value.
+            // Confirm that initialize() can be called again after a shutdown (the unit
+            // tests shut down after every test and then initialize again).
+            enabled = false;
+        }
+    }
+
+    /**
+     * Returns the OpenTelemetry instance held by this class.
+     *
+     * @return the stored OpenTelemetry instance, or a no-op instance when
+     *         instrumentation is disabled or no instance has been stored; never null
+     */
+    public static OpenTelemetry getOpenTelemetry() {
+        if (!enabled || openTelemetry == null) {
+            return OpenTelemetry.noop();
+        }
+        return openTelemetry;
+    }
+}
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetryConfig.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetryConfig.java
new file mode 100644
index 0000000000..2cc2af0a2b
--- /dev/null
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaOpenTelemetryConfig.java
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.tika.server.core; + +import java.util.Map; + +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.utils.StringUtils; + +/** + * Configuration for OpenTelemetry instrumentation in Tika Server. + *

+ * This class encapsulates all OpenTelemetry-related configuration settings, + * including enabling/disabling instrumentation, OTLP endpoint configuration, + * service identification, and sampling policies. + *

+ *

+ * Configuration can be loaded from: + *

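+ * <ul>
+ *   <li>Environment variables (read once, in the constructor)</li>
+ *   <li>XML configuration via TikaServerConfig (applied through the setters; a value
+ *       set this way replaces the one read from the environment)</li>
+ * </ul>
+ *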

+ *

+ * Example XML configuration: + *

{@code
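+ * <openTelemetry>
+ *   <enabled>true</enabled>
+ *   <exporterType>otlp</exporterType>
+ *   <otlpEndpoint>http://localhost:4317</otlpEndpoint>
+ *   <serviceName>tika-server</serviceName>
+ *   <samplingProbability>1.0</samplingProbability>
+ *   <exportTimeoutMillis>30000</exportTimeoutMillis>
+ * </openTelemetry>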
+ * }
+ *

+ *
+ * @since 4.0.0
+ */
+public class TikaOpenTelemetryConfig implements Initializable {
+
+    /** Environment variable to disable the OpenTelemetry SDK */
+    public static final String OTEL_SDK_DISABLED_ENV = "OTEL_SDK_DISABLED";
+
+    /** Environment variable for the OTLP exporter endpoint URL */
+    public static final String OTEL_EXPORTER_OTLP_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_ENDPOINT";
+
+    /** Environment variable for the service name identifier */
+    public static final String OTEL_SERVICE_NAME_ENV = "OTEL_SERVICE_NAME";
+
+    /**
+     * Environment variable for the trace sampling strategy.
+     * Declared for callers; loadFromEnvironment() in this class does not read it.
+     */
+    public static final String OTEL_TRACES_SAMPLER_ENV = "OTEL_TRACES_SAMPLER";
+
+    /** Environment variable for the trace sampling probability */
+    public static final String OTEL_TRACES_SAMPLER_ARG_ENV = "OTEL_TRACES_SAMPLER_ARG";
+
+    // Defaults used until the environment or a setter supplies a value.
+    private static final String DEFAULT_SERVICE_NAME = "tika-server";
+    private static final String DEFAULT_OTLP_ENDPOINT = "http://localhost:4317";
+    private static final double DEFAULT_SAMPLING_PROBABILITY = 1.0;
+
+    // Instrumentation is off unless the environment or the configuration turns it on.
+    private boolean enabled = false;
+    private String exporterType = "otlp";
+    private String otlpEndpoint = DEFAULT_OTLP_ENDPOINT;
+    private String serviceName = DEFAULT_SERVICE_NAME;
+    private double samplingProbability = DEFAULT_SAMPLING_PROBABILITY;
+    private int exportTimeoutMillis = 30000;
+
+    /**
+     * Creates a new OpenTelemetry configuration instance.
+     * Automatically loads configuration from environment variables.
+     */
+    public TikaOpenTelemetryConfig() {
+        // Runs before any setter, so values set afterwards overwrite what the environment provided.
+        loadFromEnvironment();
+    }
+
+    /**
+     * Loads configuration from standard OpenTelemetry environment variables.
+     * This method is called automatically during construction.
+     *

+ * Recognized environment variables: + *

+     * <ul>
+     *   <li>{@code OTEL_SDK_DISABLED}: Set to "true" to disable OpenTelemetry</li>
+     *   <li>{@code OTEL_EXPORTER_OTLP_ENDPOINT}: OTLP endpoint URL (enables OpenTelemetry if set)</li>
+     *   <li>{@code OTEL_SERVICE_NAME}: Service name for identifying this Tika instance</li>
+     *   <li>{@code OTEL_TRACES_SAMPLER_ARG}: Sampling probability (0.0-1.0)</li>
+     * </ul>
+     *

+     */
+    private void loadFromEnvironment() {
+        // Only the literal "true" (any case) disables the SDK; every other value,
+        // including "false" and unset, leaves it eligible to be enabled below.
+        boolean sdkDisabled = "true".equalsIgnoreCase(System.getenv(OTEL_SDK_DISABLED_ENV));
+        if (sdkDisabled) {
+            this.enabled = false;
+        }
+
+        String endpoint = System.getenv(OTEL_EXPORTER_OTLP_ENDPOINT_ENV);
+        if (!StringUtils.isBlank(endpoint)) {
+            this.otlpEndpoint = endpoint;
+            // Configuring an endpoint opts in, unless the SDK was explicitly disabled.
+            // Previously any non-null OTEL_SDK_DISABLED value (even "false") blocked this.
+            if (!sdkDisabled) {
+                this.enabled = true;
+            }
+        }
+
+        String envServiceName = System.getenv(OTEL_SERVICE_NAME_ENV);
+        if (!StringUtils.isBlank(envServiceName)) {
+            this.serviceName = envServiceName;
+        }
+
+        String samplerArg = System.getenv(OTEL_TRACES_SAMPLER_ARG_ENV);
+        if (!StringUtils.isBlank(samplerArg)) {
+            try {
+                double parsed = Double.parseDouble(samplerArg);
+                // "NaN" parses successfully but compares false against both range bounds,
+                // so it would pass checkInitialization(); keep the default instead.
+                // Out-of-range numbers are still stored and reported by checkInitialization().
+                if (!Double.isNaN(parsed)) {
+                    this.samplingProbability = parsed;
+                }
+            } catch (NumberFormatException ignored) {
+                // Not a number: keep the default sampling probability.
+            }
+        }
+    }
+
+    /**
+     * Returns whether OpenTelemetry instrumentation is enabled.
+     *
+     * @return true if OpenTelemetry is enabled, false otherwise
+     */
+    public boolean isEnabled() {
+        return enabled;
+    }
+
+    /**
+     * Sets whether OpenTelemetry instrumentation is enabled.
+     *
+     * @param enabled true to enable OpenTelemetry, false to disable
+     */
+    public void setEnabled(boolean enabled) {
+        this.enabled = enabled;
+    }
+
+    /**
+     * Returns the type of exporter to use for telemetry data.
+     *
+     * @return the exporter type (currently only "otlp" is supported)
+     */
+    public String getExporterType() {
+        return exporterType;
+    }
+
+    /**
+     * Sets the type of exporter to use for telemetry data.
+     *
+     * @param exporterType the exporter type (currently only "otlp" is supported)
+     */
+    public void setExporterType(String exporterType) {
+        this.exporterType = exporterType;
+    }
+
+    /**
+     * Returns the OTLP endpoint URL for exporting telemetry data.
+     *
+     * @return the OTLP endpoint URL (e.g., "http://localhost:4317")
+     */
+    public String getOtlpEndpoint() {
+        return otlpEndpoint;
+    }
+
+    /**
+     * Sets the OTLP endpoint URL for exporting telemetry data.
+     *

+ * Port reference: + *

+     * <ul>
+     *   <li>4317: gRPC protocol (default for manual instrumentation)</li>
+     *   <li>4318: HTTP protocol (default for Java agent)</li>
+     * </ul>
+     *

+     *
+     * @param otlpEndpoint the OTLP endpoint URL
+     */
+    public void setOtlpEndpoint(String otlpEndpoint) {
+        // Stored as given; checkInitialization() rejects a blank endpoint when enabled.
+        this.otlpEndpoint = otlpEndpoint;
+    }
+
+    /**
+     * Returns the service name used to identify this Tika Server instance.
+     *
+     * @return the service name
+     */
+    public String getServiceName() {
+        return serviceName;
+    }
+
+    /**
+     * Sets the service name used to identify this Tika Server instance.
+     * This name appears in traces and helps identify different Tika deployments.
+     *
+     * @param serviceName the service name
+     */
+    public void setServiceName(String serviceName) {
+        this.serviceName = serviceName;
+    }
+
+    /**
+     * Returns the trace sampling probability.
+     *
+     * @return the sampling probability (0.0 = sample nothing, 1.0 = sample everything)
+     */
+    public double getSamplingProbability() {
+        return samplingProbability;
+    }
+
+    /**
+     * Sets the trace sampling probability.
+     * This determines what fraction of traces are sampled and exported.
+     *
+     * @param samplingProbability the sampling probability (must be between 0.0 and 1.0)
+     */
+    public void setSamplingProbability(double samplingProbability) {
+        // No range check here; checkInitialization() rejects values outside 0.0-1.0.
+        this.samplingProbability = samplingProbability;
+    }
+
+    /**
+     * Returns the timeout in milliseconds for exporting telemetry data.
+     *
+     * @return the export timeout in milliseconds
+     */
+    public int getExportTimeoutMillis() {
+        return exportTimeoutMillis;
+    }
+
+    /**
+     * Sets the timeout in milliseconds for exporting telemetry data.
+     *
+     * @param exportTimeoutMillis the export timeout in milliseconds
+     */
+    public void setExportTimeoutMillis(int exportTimeoutMillis) {
+        this.exportTimeoutMillis = exportTimeoutMillis;
+    }
+
+    /**
+     * Renders every configuration field as {@code name=value} pairs inside
+     * {@code TikaOpenTelemetryConfig{...}}, in declaration order.
+     *
+     * @return a single-line, human-readable dump of this configuration
+     */
+    @Override
+    public String toString() {
+        StringBuilder text = new StringBuilder("TikaOpenTelemetryConfig{enabled=");
+        text.append(enabled);
+        text.append(", exporterType=").append(exporterType);
+        text.append(", otlpEndpoint=").append(otlpEndpoint);
+        text.append(", serviceName=").append(serviceName);
+        text.append(", samplingProbability=").append(samplingProbability);
+        text.append(", exportTimeoutMillis=").append(exportTimeoutMillis);
+        text.append("}");
+        return text.toString();
+    }
+
+    /**
+     * Initializes the configuration after all fields have been set.
+     * This method is called by the Tika configuration framework after
+     * all setter methods have been invoked.
+     *

+ * For this configuration class, no additional initialization is required + * as all fields are directly set via setters. + *

+ * + * @param params configuration parameters (not used in this implementation) + * @throws TikaConfigException if initialization fails + */ + @Override + public void initialize(Map params) throws TikaConfigException { + // All fields are set via setters by ConfigBase before this is called + // No additional initialization needed + } + + /** + * Validates the configuration settings. + * This method is called by the Tika configuration framework to ensure + * all configuration values are valid and consistent. + * + * @param problemHandler handler for reporting configuration problems + * @throws TikaConfigException if the configuration is invalid + */ + @Override + public void checkInitialization(InitializableProblemHandler problemHandler) + throws TikaConfigException { + // Validate configuration if needed + if (enabled && StringUtils.isBlank(otlpEndpoint)) { + throw new TikaConfigException( + "OpenTelemetry is enabled but otlpEndpoint is not configured"); + } + if (samplingProbability < 0.0 || samplingProbability > 1.0) { + throw new TikaConfigException( + "samplingProbability must be between 0.0 and 1.0, got: " + samplingProbability); + } + } +} diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerConfig.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerConfig.java index 663e95bebf..1b8217d705 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerConfig.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerConfig.java @@ -132,6 +132,8 @@ public class TikaServerConfig extends ConfigBase { private TlsConfig tlsConfig = new TlsConfig(); + private TikaOpenTelemetryConfig openTelemetry = new TikaOpenTelemetryConfig(); + /** * Config with only the defaults */ @@ -568,6 +570,14 @@ public void setTlsConfig(TlsConfig tlsConfig) { this.tlsConfig = tlsConfig; } + public TikaOpenTelemetryConfig getOpenTelemetry() { + 
return openTelemetry; + } + + public void setOpenTelemetry(TikaOpenTelemetryConfig openTelemetry) { + this.openTelemetry = openTelemetry; + } + public List getEndpoints() { return endpoints; } diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java index e1afe24918..15fd6a0d56 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java @@ -188,6 +188,14 @@ private static ServerDetails initServer(TikaServerConfig tikaServerConfig) throw tika = TikaConfig.getDefaultConfig(); } + // Initialize OpenTelemetry if enabled + TikaOpenTelemetry.initialize(tikaServerConfig.getOpenTelemetry()); + // Register shutdown hook to gracefully shutdown OpenTelemetry + Runtime.getRuntime().addShutdownHook(new Thread(() -> { + LOG.info("Shutting down OpenTelemetry..."); + TikaOpenTelemetry.shutdown(); + })); + DigestingParser.Digester digester = null; if (!StringUtils.isBlank(tikaServerConfig.getDigest())) { try { diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java index 51e7ab3d63..ccb786cd1c 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/DetectorResource.java @@ -36,6 +36,7 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.server.core.ServerStatus; +import org.apache.tika.server.core.TikaOpenTelemetry; @Path("/detect") public class DetectorResource { @@ -61,23 +62,55 @@ public String detect(final InputStream is, @Context 
HttpHeaders httpHeaders, @Co long timeoutMillis = TikaResource.getTaskTimeout(parseContext); long taskId = serverStatus.start(ServerStatus.TASK.DETECT, filename, timeoutMillis); + // Start OpenTelemetry span for detect operation + io.opentelemetry.api.trace.Span span = null; + if (TikaOpenTelemetry.isEnabled()) { + span = TikaOpenTelemetry.getTracer() + .spanBuilder("tika.detect") + .setAttribute("tika.resource_name", filename != null ? filename : "unknown") + .setAttribute("tika.endpoint", "/detect") + .startSpan(); + } + try (TikaInputStream tis = TikaInputStream.get(TikaResource.getInputStream(is, met, httpHeaders, info))) { - return TikaResource + String detectedType = TikaResource .getConfig() .getDetector() .detect(tis, met) .toString(); + + if (span != null) { + span.setAttribute("tika.detected_type", detectedType); + span.setStatus(io.opentelemetry.api.trace.StatusCode.OK); + } + + return detectedType; } catch (IOException e) { LOG.warn("Unable to detect MIME type for file. Reason: {} ({})", e.getMessage(), filename, e); + if (span != null) { + span.recordException(e); + span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "IO error during detection"); + } return MediaType.OCTET_STREAM.toString(); } catch (OutOfMemoryError e) { LOG.error("OOM while detecting: ({})", filename, e); serverStatus.setStatus(ServerStatus.STATUS.ERROR); + if (span != null) { + span.recordException(e); + span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "Out of memory"); + } throw e; } catch (Throwable e) { LOG.error("Exception while detecting: ({})", filename, e); + if (span != null) { + span.recordException(e); + span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "Detection error"); + } throw e; } finally { + if (span != null) { + span.end(); + } serverStatus.complete(taskId); } } diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java index 29e452d945..9a847b8c60 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java @@ -41,8 +41,10 @@ import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.language.detect.LanguageHandler; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.apache.tika.server.core.TikaOpenTelemetry; @Path("/meta") @@ -137,12 +139,44 @@ protected Metadata parseMetadata(InputStream is, Metadata metadata, MultivaluedM //no need to parse embedded docs context.set(DocumentSelector.class, metadata1 -> false); - TikaResource.logRequest(LOG, "/meta", metadata); - TikaResource.parse(parser, LOG, info.getPath(), is, new LanguageHandler() { - public void endDocument() { - metadata.set("language", getLanguage().getLanguage()); + String fileName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); + + // Start OpenTelemetry span for metadata extraction operation + io.opentelemetry.api.trace.Span span = null; + if (TikaOpenTelemetry.isEnabled()) { + span = TikaOpenTelemetry.getTracer() + .spanBuilder("tika.metadata.extract") + .setAttribute("tika.resource_name", fileName != null ? 
fileName : "unknown") + .setAttribute("tika.endpoint", "/meta") + .startSpan(); + } + + try { + TikaResource.logRequest(LOG, "/meta", metadata); + TikaResource.parse(parser, LOG, info.getPath(), is, new LanguageHandler() { + public void endDocument() { + metadata.set("language", getLanguage().getLanguage()); + } + }, metadata, context); + + // Add metadata count to span + if (span != null) { + span.setAttribute("tika.metadata_count", metadata.names().length); + span.setStatus(io.opentelemetry.api.trace.StatusCode.OK); } - }, metadata, context); - return metadata; + + return metadata; + } catch (IOException e) { + if (span != null) { + span.recordException(e); + span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, + "Metadata extraction error"); + } + throw e; + } finally { + if (span != null) { + span.end(); + } + } } } diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index 7fc042a7d5..0871bba457 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@ -84,6 +84,7 @@ import org.apache.tika.server.core.InputStreamFactory; import org.apache.tika.server.core.ParseContextConfig; import org.apache.tika.server.core.ServerStatus; +import org.apache.tika.server.core.TikaOpenTelemetry; import org.apache.tika.server.core.TikaServerConfig; import org.apache.tika.server.core.TikaServerParseException; import org.apache.tika.utils.ExceptionUtils; @@ -361,23 +362,64 @@ public static void parse(Parser parser, Logger logger, String path, InputStream long timeoutMillis = getTaskTimeout(parseContext); long taskId = SERVER_STATUS.start(ServerStatus.TASK.PARSE, fileName, timeoutMillis); + + // Start OpenTelemetry span for parse operation + 
io.opentelemetry.api.trace.Span span = null; + if (TikaOpenTelemetry.isEnabled()) { + span = TikaOpenTelemetry.getTracer() + .spanBuilder("tika.parse") + .setAttribute("tika.resource_name", fileName != null ? fileName : "unknown") + .setAttribute("tika.endpoint", path != null ? path : "/tika") + .startSpan(); + + // Add detected content type if available + String contentType = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE); + if (contentType != null) { + span.setAttribute("tika.content_type", contentType); + } + } + try { parser.parse(inputStream, handler, metadata, parseContext); + + // Mark span as successful and add output metadata + if (span != null) { + span.setStatus(io.opentelemetry.api.trace.StatusCode.OK); + } } catch (SAXException e) { + if (span != null) { + span.recordException(e); + span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "SAX parsing error"); + } throw new TikaServerParseException(e); } catch (EncryptedDocumentException e) { logger.warn("{}: Encrypted document ({})", path, fileName, e); + if (span != null) { + span.recordException(e); + span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "Encrypted document"); + } throw new TikaServerParseException(e); } catch (Exception e) { if (!WriteLimitReachedException.isWriteLimitReached(e)) { logger.warn("{}: Text extraction failed ({})", path, fileName, e); } + if (span != null) { + span.recordException(e); + span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "Parse error"); + } throw new TikaServerParseException(e); } catch (OutOfMemoryError e) { logger.warn("{}: OOM ({})", path, fileName, e); SERVER_STATUS.setStatus(ServerStatus.STATUS.ERROR); + if (span != null) { + span.recordException(e); + span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "Out of memory"); + } throw e; } finally { + if (span != null) { + span.end(); + } SERVER_STATUS.complete(taskId); inputStream.close(); } diff --git 
a/tika-server/tika-server-core/src/main/resources/tika-server-config-default.xml b/tika-server/tika-server-core/src/main/resources/tika-server-config-default.xml index 007a1be5a2..ab03aed02e 100644 --- a/tika-server/tika-server-core/src/main/resources/tika-server-config-default.xml +++ b/tika-server/tika-server-core/src/main/resources/tika-server-config-default.xml @@ -89,5 +89,33 @@ Not allowed if nofork=true. --> java + + + + + false + + + otlp + + + http://localhost:4317 + + + tika-server + + + 1.0 + + + 30000 + diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaOpenTelemetryTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaOpenTelemetryTest.java new file mode 100644 index 0000000000..53ed6831fc --- /dev/null +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaOpenTelemetryTest.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.server.core; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.Tracer; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for OpenTelemetry integration in Tika Server. + *

+ * These tests verify the basic functionality of the OpenTelemetry configuration + * and initialization, including configuration defaults, environment variable loading, + * SDK lifecycle management, and span creation. + *

+ * + * @since 4.0.0 + */ +public class TikaOpenTelemetryTest { + + /** + * Cleans up OpenTelemetry resources after each test. + * Ensures tests don't interfere with each other by shutting down + * the SDK between tests. + */ + @AfterEach + public void cleanup() { + TikaOpenTelemetry.shutdown(); + } + + /** + * Verifies that OpenTelemetry is disabled by default when no + * environment variables or explicit configuration is provided. + */ + @Test + public void testDisabledByDefault() { + // By default, OTEL should be disabled unless configured + TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig(); + assertFalse(config.isEnabled(), "OpenTelemetry should be disabled by default"); + } + + /** + * Verifies that configuration defaults are correctly set to + * standard OpenTelemetry values. + */ + @Test + public void testConfigurationDefaults() { + TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig(); + + assertEquals("tika-server", config.getServiceName()); + assertEquals("http://localhost:4317", config.getOtlpEndpoint()); + assertEquals(1.0, config.getSamplingProbability()); + assertEquals(30000, config.getExportTimeoutMillis()); + assertEquals("otlp", config.getExporterType()); + } + + /** + * Verifies that configuration can load from environment variables. + * Note: This is a basic test that verifies object creation. + * Full environment variable testing would require system property manipulation. + */ + @Test + public void testEnvironmentVariableLoading() { + // Note: This test would need to set environment variables before creating config + // For now, just verify the config object is created + TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig(); + assertNotNull(config); + } + + /** + * Tests full OpenTelemetry SDK initialization with valid configuration. + * Verifies that the SDK is enabled and a valid Tracer instance is returned. 
+ */ + @Test + public void testOpenTelemetryInitialization() { + TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig(); + config.setEnabled(true); + config.setOtlpEndpoint("http://localhost:4317"); + + TikaOpenTelemetry.initialize(config); + + assertTrue(TikaOpenTelemetry.isEnabled(), "OpenTelemetry should be enabled after initialization"); + + Tracer tracer = TikaOpenTelemetry.getTracer(); + assertNotNull(tracer, "Tracer should not be null"); + } + + /** + * Verifies that when OpenTelemetry is explicitly disabled in configuration, + * the SDK remains disabled after initialization. + */ + @Test + public void testOpenTelemetryDisabled() { + TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig(); + config.setEnabled(false); + + TikaOpenTelemetry.initialize(config); + + assertFalse(TikaOpenTelemetry.isEnabled(), "OpenTelemetry should remain disabled"); + } + + /** + * Tests the creation of custom spans using the OpenTelemetry Tracer. + * Verifies that spans can be created with attributes and properly ended. + */ + @Test + public void testSpanCreation() { + TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig(); + config.setEnabled(true); + + TikaOpenTelemetry.initialize(config); + + if (TikaOpenTelemetry.isEnabled()) { + Tracer tracer = TikaOpenTelemetry.getTracer(); + Span span = tracer.spanBuilder("test.operation") + .setAttribute("test.attribute", "test.value") + .startSpan(); + + assertNotNull(span, "Span should be created"); + + span.end(); + } + } + + /** + * Tests all getter and setter methods on TikaOpenTelemetryConfig + * to ensure proper field access and mutation. 
+ */ + @Test + public void testGettersAndSetters() { + TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig(); + + config.setEnabled(true); + assertTrue(config.isEnabled()); + + config.setServiceName("test-service"); + assertEquals("test-service", config.getServiceName()); + + config.setOtlpEndpoint("http://test:4317"); + assertEquals("http://test:4317", config.getOtlpEndpoint()); + + config.setSamplingProbability(0.5); + assertEquals(0.5, config.getSamplingProbability()); + + config.setExportTimeoutMillis(60000); + assertEquals(60000, config.getExportTimeoutMillis()); + + config.setExporterType("console"); + assertEquals("console", config.getExporterType()); + } + + /** + * Tests the toString() method of TikaOpenTelemetryConfig. + * Verifies that the string representation includes key configuration values. + */ + @Test + public void testToString() { + TikaOpenTelemetryConfig config = new TikaOpenTelemetryConfig(); + config.setEnabled(true); + config.setServiceName("my-tika"); + + String str = config.toString(); + assertNotNull(str); + assertTrue(str.contains("enabled=true")); + assertTrue(str.contains("my-tika")); + } +} diff --git a/tika-server/tika-server-standard/bin/tika b/tika-server/tika-server-standard/bin/tika index 90f4b39729..61d989c735 100755 --- a/tika-server/tika-server-standard/bin/tika +++ b/tika-server/tika-server-standard/bin/tika @@ -487,7 +487,22 @@ function start_tika() { exit 1 fi - TIKA_START_OPTS=('-server' "${JAVA_MEM_OPTS[@]}" "${GC_TUNE[@]}" "${GC_LOG_OPTS[@]}" \ + # OpenTelemetry Java Agent support + # If OTEL_JAVAAGENT_PATH is set, add the -javaagent flag for auto-instrumentation + OTEL_AGENT_OPTS=() + if [ -n "$OTEL_JAVAAGENT_PATH" ]; then + if [ -f "$OTEL_JAVAAGENT_PATH" ]; then + OTEL_AGENT_OPTS+=("-javaagent:$OTEL_JAVAAGENT_PATH") + if $verbose ; then + echo -e " OTEL Java Agent = $OTEL_JAVAAGENT_PATH" + fi + else + echo -e "\nWARNING: OTEL_JAVAAGENT_PATH is set but file not found: $OTEL_JAVAAGENT_PATH" + echo -e " 
Continuing without OpenTelemetry auto-instrumentation.\n" + fi + fi + + TIKA_START_OPTS=('-server' "${OTEL_AGENT_OPTS[@]}" "${JAVA_MEM_OPTS[@]}" "${GC_TUNE[@]}" "${GC_LOG_OPTS[@]}" \ "${TIKA_HOST_ARG[@]}" \ "${LOG4J_CONFIG[@]}" "${TIKA_OPTS[@]}") diff --git a/tika-server/tika-server-standard/docker/README.md b/tika-server/tika-server-standard/docker/README.md new file mode 100644 index 0000000000..e773281e3a --- /dev/null +++ b/tika-server/tika-server-standard/docker/README.md @@ -0,0 +1,146 @@ +# OpenTelemetry Observability Stack for Tika Server + +This directory contains Docker Compose configurations for running an observability stack alongside Tika Server. + +## Quick Start + +### 1. Start Jaeger (Traces) + +```bash +docker-compose -f docker-compose-otel.yml up -d jaeger +``` + +This starts Jaeger all-in-one which includes: +- OTLP gRPC receiver on port 4317 +- OTLP HTTP receiver on port 4318 +- Jaeger UI on port 16686 + +### 2. Configure Tika Server + +Set the OTLP endpoint environment variable: + +```bash +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 +export OTEL_SERVICE_NAME=my-tika-server +``` + +Or enable via XML configuration in `tika-server-config.xml`: + +```xml + + true + http://localhost:4317 + my-tika-server + +``` + +### 3. Start Tika Server + +```bash +java -jar tika-server-standard/target/tika-server-standard-*.jar +``` + +### 4. Send Test Requests + +```bash +# Parse a document +curl -T sample.pdf http://localhost:9998/tika + +# Detect MIME type +echo "Hello World" | curl -X PUT --data-binary @- http://localhost:9998/detect/stream + +# Extract metadata +curl -T sample.pdf http://localhost:9998/meta +``` + +### 5. 
View Traces in Jaeger UI + +Open your browser to: http://localhost:16686 + +- Select "my-tika-server" from the Service dropdown +- Click "Find Traces" +- Click on a trace to see detailed span information + +## Services + +### Jaeger + +Jaeger provides distributed tracing: +- **UI**: http://localhost:16686 +- **OTLP gRPC**: localhost:4317 +- **OTLP HTTP**: localhost:4318 + +### Prometheus (Optional) + +To start Prometheus for metrics collection: + +```bash +docker-compose -f docker-compose-otel.yml --profile with-metrics up -d +``` + +Access Prometheus UI at: http://localhost:9090 + +## Stopping Services + +```bash +docker-compose -f docker-compose-otel.yml down +``` + +To remove volumes as well: + +```bash +docker-compose -f docker-compose-otel.yml down -v +``` + +## Troubleshooting + +### Traces not appearing in Jaeger + +1. Check Tika Server logs for OpenTelemetry initialization messages +2. Verify OTEL_EXPORTER_OTLP_ENDPOINT is set correctly +3. Check Jaeger logs: `docker logs tika-jaeger` +4. Ensure firewall allows connection to port 4317 + +### Connection refused errors + +Make sure Jaeger is running: +```bash +docker ps | grep jaeger +``` + +If using Docker on Mac/Windows, use `host.docker.internal` instead of `localhost`: +```bash +export OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4317 +``` + +## Advanced Configuration + +### Custom Prometheus Configuration + +Create a `prometheus.yml` file in this directory: + +```yaml +global: + scrape_interval: 15s + +scrape_configs: + - job_name: 'tika-server' + static_configs: + - targets: ['host.docker.internal:9998'] +``` + +Then start with the metrics profile. 
+ +### Using with Tika Docker Container + +If running Tika Server in Docker, attach it to the network that Compose created for this stack (named `docker_tika-otel` by default, because the Compose project name defaults to the name of this `docker` directory; confirm with `docker network ls`): + +```bash +docker run -d \ + --name tika-server \ + --network docker_tika-otel \ + -e OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4317 \ + -e OTEL_SERVICE_NAME=tika-server \ + -p 9998:9998 \ + apache/tika:latest +``` diff --git a/tika-server/tika-server-standard/docker/docker-compose-otel.yml b/tika-server/tika-server-standard/docker/docker-compose-otel.yml new file mode 100644 index 0000000000..ff02b77a08 --- /dev/null +++ b/tika-server/tika-server-standard/docker/docker-compose-otel.yml @@ -0,0 +1,47 @@ +# Docker Compose configuration for OpenTelemetry observability stack +# This sets up Jaeger for trace collection and visualization +version: '3.8' + +services: + # Jaeger all-in-one: collector, query service, and UI + jaeger: + image: jaegertracing/all-in-one:latest + container_name: tika-jaeger + ports: + - "16686:16686" # Jaeger UI + - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP HTTP receiver + - "14250:14250" # Jaeger gRPC + environment: + - COLLECTOR_OTLP_ENABLED=true + - LOG_LEVEL=info + networks: + - tika-otel + restart: unless-stopped + + # Optional: Prometheus for metrics collection + prometheus: + image: prom/prometheus:latest + container_name: tika-prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + networks: + - tika-otel + restart: unless-stopped + profiles: + - with-metrics + +networks: + tika-otel: + driver: bridge + +volumes: + prometheus-data: diff --git a/tika-server/tika-server-standard/otel-agent/README.md b/tika-server/tika-server-standard/otel-agent/README.md new file mode 100644 index 0000000000..e270d55099 --- 
/dev/null +++ b/tika-server/tika-server-standard/otel-agent/README.md @@ -0,0 +1,216 @@ +# OpenTelemetry Java Agent for Tika Server + +This directory contains instructions for using the OpenTelemetry Java Agent for automatic instrumentation of Tika Server. + +## What is Auto-Instrumentation? + +The OpenTelemetry Java Agent provides automatic instrumentation for many popular Java libraries without requiring code changes. When attached to Tika Server, it automatically captures: + +- HTTP request/response traces (via Jetty instrumentation) +- JDBC database calls (if using database features) +- Additional framework-level spans + +This complements the manual instrumentation already present in Tika Server for Tika-specific operations. + +## Download the Agent + +### Option 1: Direct Download + +Download the latest OpenTelemetry Java Agent: + +```bash +curl -L -O https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar +``` + +### Option 2: Maven + +Add to your pom.xml and use Maven to download: + +```xml +<dependency> + <groupId>io.opentelemetry.javaagent</groupId> + <artifactId>opentelemetry-javaagent</artifactId> + <version>2.10.0</version> + <scope>runtime</scope> +</dependency> +``` + +### Option 3: Specific Version + +```bash +VERSION=2.10.0 +curl -L -O "https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v${VERSION}/opentelemetry-javaagent.jar" +``` + +## Usage + +### Basic Usage + +```bash +java -javaagent:opentelemetry-javaagent.jar \ + -Dotel.service.name=tika-server \ + -Dotel.exporter.otlp.protocol=grpc -Dotel.exporter.otlp.endpoint=http://localhost:4317 \ + -jar tika-server-standard-*.jar +``` + +### With Environment Variables + +```bash +export OTEL_SERVICE_NAME=tika-server +export OTEL_EXPORTER_OTLP_PROTOCOL=grpc OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 +export OTEL_TRACES_EXPORTER=otlp +export OTEL_METRICS_EXPORTER=otlp +export OTEL_LOGS_EXPORTER=otlp + +java -javaagent:opentelemetry-javaagent.jar \ + -jar tika-server-standard-*.jar +``` + +### Using the Tika Startup Script + +If you place the agent JAR in this 
directory, you can use an environment variable: + +```bash +export OTEL_JAVAAGENT_PATH=/path/to/otel-agent/opentelemetry-javaagent.jar +export OTEL_EXPORTER_OTLP_PROTOCOL=grpc OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 + +./bin/tika +``` + +## Configuration Options + +### Common System Properties + +| Property | Description | Default | +|----------|-------------|---------| +| `otel.service.name` | Service name for traces | `unknown_service:java` | +| `otel.exporter.otlp.endpoint` | OTLP endpoint URL | `http://localhost:4318` (agent 2.x default protocol `http/protobuf`); `http://localhost:4317` when `otel.exporter.otlp.protocol=grpc` | +| `otel.traces.sampler` | Sampling strategy | `parentbased_always_on` | +| `otel.traces.sampler.arg` | Sampler argument (e.g., probability) | - | +| `otel.instrumentation.http.capture-headers.server.request` | Capture request headers | - | +| `otel.instrumentation.http.capture-headers.server.response` | Capture response headers | - | + +### Sampling Configuration + +Sample 10% of traces: +```bash +-Dotel.traces.sampler=traceidratio \ +-Dotel.traces.sampler.arg=0.1 +``` + +### Disable Specific Instrumentations + +```bash +# Disable JDBC instrumentation +-Dotel.instrumentation.jdbc.enabled=false + +# Disable HttpURLConnection (HTTP client) instrumentation +-Dotel.instrumentation.http-url-connection.enabled=false +``` + +## What Gets Instrumented Automatically? + +With the Java Agent attached, you'll see additional telemetry for: + +### HTTP Server (Jetty/CXF) +- `HTTP GET /tika` +- `HTTP PUT /detect/stream` +- Request/response headers +- HTTP status codes +- Request duration + +### JDBC (if used) +- Database queries +- Connection pool metrics +- Transaction boundaries + +### JVM Metrics +- Memory usage +- GC activity +- Thread counts +- CPU usage + +## Combining Auto and Manual Instrumentation + +The Java Agent works seamlessly with Tika's manual instrumentation: + +1. **Auto-instrumentation** creates high-level HTTP request spans +2. **Manual instrumentation** creates detailed Tika-specific spans (parse, detect, metadata) +3. 
Spans are automatically nested, showing the complete request flow + +Example trace structure: +``` +HTTP PUT /tika (auto-instrumented) + └─ tika.parse (manual) + ├─ parser.initialization + └─ content.extraction +``` + +## Verifying Installation + +After starting Tika Server with the agent: + +1. Check logs for OpenTelemetry initialization: + ``` + [otel.javaagent] OpenTelemetry Java Agent ... + ``` + +2. Send a test request: + ```bash + curl -T test.pdf http://localhost:9998/tika + ``` + +3. View traces in Jaeger UI (http://localhost:16686) + +4. Look for both auto-instrumented (`HTTP PUT`) and manual (`tika.parse`) spans + +## Performance Considerations + +The Java Agent adds minimal overhead: +- Typical overhead: 1-3% +- Can be reduced with sampling +- Async span export prevents blocking + +To measure impact: +```bash +# Without agent +time java -jar tika-server-standard-*.jar & + +# With agent +time java -javaagent:opentelemetry-javaagent.jar -jar tika-server-standard-*.jar & +``` + +## Troubleshooting + +### Agent not loading + +Ensure the JAR path is correct: +```bash +java -javaagent:/full/path/to/opentelemetry-javaagent.jar -jar ... +``` + +### No auto-instrumented spans + +Check that supported libraries are being used: +```bash +-Dotel.javaagent.debug=true +``` + +### Conflicts with manual instrumentation + +The agent is compatible with manual SDK usage. Spans are automatically correlated. 
+ +### Excessive spans + +Disable unwanted instrumentations: +```bash +-Dotel.instrumentation.common.default-enabled=false \ +-Dotel.instrumentation.jetty.enabled=true \ +-Dotel.instrumentation.http.enabled=true +``` + +## Further Reading + +- [OpenTelemetry Java Agent Documentation](https://opentelemetry.io/docs/instrumentation/java/automatic/) +- [Supported Libraries and Frameworks](https://github.com/open-telemetry/opentelemetry-java-instrumentation/blob/main/docs/supported-libraries.md) +- [Configuration Options](https://opentelemetry.io/docs/instrumentation/java/automatic/agent-config/)