From db944818bdeeb81a265aa8a4f158f44c298e790e Mon Sep 17 00:00:00 2001
From: Vijay Krishna <39220950+vijaykriishna@users.noreply.github.com>
Date: Thu, 16 Oct 2025 16:54:22 +0530
Subject: [PATCH] LUCENE-15196: Support multiple delimiters

---
 .../analysis/path/PathHierarchyTokenizer.java | 36 +++++++++++++++++--
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java
index dfd727570342..d90d7e4d068d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java
@@ -17,6 +17,9 @@
 package org.apache.lucene.analysis.path;
 
 import java.io.IOException;
+import java.util.HashSet;
+import java.util.Objects;
+import java.util.Set;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -84,17 +87,42 @@ public PathHierarchyTokenizer(
     }
     termAtt.resizeBuffer(bufferSize);
 
-    this.delimiter = delimiter;
+    this.delimiters = Set.of(delimiter);
     this.replacement = replacement;
     this.skip = skip;
     resultToken = new StringBuilder(bufferSize);
   }
 
+  /**
+   * Creates a tokenizer that treats every character in {@code delimiters} as a path delimiter.
+   *
+   * <p>The supplied set is copied, so changing it afterwards does not affect this tokenizer.
+   *
+   * @param delimiters delimiter characters; if {@code null} or empty, a set holding only {@link
+   *     #DEFAULT_DELIMITER} is used instead
+   * @param replacement character appended to the term in place of each delimiter character
+   * @param skip skip count stored for use by {@link #incrementToken()}
+   */
+  public PathHierarchyTokenizer(Set<Character> delimiters, char replacement, int skip) {
+    super(DEFAULT_TOKEN_ATTRIBUTE_FACTORY);
+    termAtt.resizeBuffer(DEFAULT_BUFFER_SIZE);
+
+    // Use Set.of for the fallback: new HashSet<>(DEFAULT_DELIMITER) widens the char to int and
+    // calls HashSet(int initialCapacity), producing an empty set with no delimiters at all.
+    this.delimiters =
+        Objects.isNull(delimiters) || delimiters.isEmpty()
+            ? Set.of(DEFAULT_DELIMITER)
+            : new HashSet<>(delimiters);
+    this.replacement = replacement;
+    this.skip = skip;
+    resultToken = new StringBuilder(DEFAULT_BUFFER_SIZE);
+  }
+
   private static final int DEFAULT_BUFFER_SIZE = 1024;
   public static final char DEFAULT_DELIMITER = '/';
   public static final int DEFAULT_SKIP = 0;
 
-  private final char delimiter;
+  private final Set<Character> delimiters;
   private final char replacement;
   private final int skip;
 
@@ -145,13 +173,13 @@ public final boolean incrementToken() throws IOException {
         added = true;
         skipped++;
         if (skipped > skip) {
-          termAtt.append(c == delimiter ? replacement : (char) c);
+          termAtt.append(delimiters.contains((char) c) ? replacement : (char) c);
           length++;
         } else {
           startPosition++;
         }
       } else {
-        if (c == delimiter) {
+        if (delimiters.contains((char) c)) {
           if (skipped > skip) {
             endDelimiter = true;
             break;