|
1 | 1 | package dev.ikm.server.cosmos.constellation.charting; |
2 | 2 |
|
| 3 | +import java.util.ArrayList; |
| 4 | +import java.util.List; |
| 5 | +import java.util.Map; |
| 6 | +import java.util.Optional; |
| 7 | +import java.util.UUID; |
| 8 | + |
3 | 9 | import org.springframework.stereotype.Component; |
4 | 10 |
|
| 11 | +import dev.ikm.tinkar.common.id.IntIdSet; |
| 12 | +import dev.ikm.tinkar.common.service.PrimitiveData; |
| 13 | +import dev.ikm.tinkar.coordinate.stamp.calculator.Latest; |
| 14 | +import dev.ikm.tinkar.entity.PatternEntityVersion; |
| 15 | +import dev.ikm.tinkar.entity.SemanticEntityVersion; |
| 16 | +import dev.ikm.tinkar.terms.TinkarTermV2; |
| 17 | +import dev.langchain4j.data.document.Metadata; |
| 18 | +import dev.langchain4j.data.embedding.Embedding; |
5 | 19 | import dev.langchain4j.data.segment.TextSegment; |
6 | 20 | import dev.langchain4j.model.embedding.EmbeddingModel; |
7 | 21 | import dev.langchain4j.store.embedding.EmbeddingStore; |
8 | 22 |
|
| 23 | +/*** |
| 24 | + * Concept: [Primary Name]. |
| 25 | + * Category: [Immediate Parent/Is-A]. |
| 26 | + * Synonyms: [Comma-separated aliases]. |
| 27 | + * Description: [Primary Definition Semantic]. |
| 28 | + * Available Semantic Features: [Semantic Name 1] (Provides [Brief |
| 29 | + * Purpose/Meaning]), [Semantic Name 2] (Provides [Brief Purpose/Meaning]). |
| 30 | + */ |
| 31 | + |
9 | 32 | @Component |
10 | 33 | public class EmbeddingChartProcessor implements ChartProcessor { |
11 | 34 |
|
12 | | - protected EmbeddingStore<TextSegment> embeddingStore; |
13 | | - protected EmbeddingModel embeddingModel; |
14 | | - |
15 | | - @Override |
16 | | - public String getProcessorName() { |
17 | | - return "Embedding Chart Processor"; |
18 | | - } |
19 | | - |
20 | | - @Override |
21 | | - public void process(ChartingContext chartingContext) { |
22 | | - |
23 | | - |
24 | | - // final List<TextSegment> names = batch.stream().map(this::generateEmbeddingName).map(TextSegment::from).toList(); |
25 | | - // final List<Metadata> metadata = batch.stream().map(row -> Map.of( |
26 | | - // "id", String.valueOf(row.get("id")), |
27 | | - // "constellationId", String.valueOf(row.get("constellationId")))) |
28 | | - // .map(Metadata::from) |
29 | | - // .toList(); |
30 | | - // final List<Embedding> embeddings = embeddingModel.embedAll(names).content(); |
31 | | - // final List<TextSegment> segments = new ArrayList<>(); |
32 | | - |
33 | | - // //Create Segments for vector store |
34 | | - // for (int i = 0; i < names.size(); i++) { |
35 | | - // segments.add(TextSegment.from(names.get(i).text(), metadata.get(i))); |
36 | | - // } |
37 | | - // embeddingStore.addAll(embeddings, segments); |
38 | | - |
39 | | - |
40 | | - chartingContext.progressUpdate().accept(50); // Update progress to 50% as an example |
41 | | - } |
| 35 | + private record EmbeddingData(TextSegment name, Metadata metadata) { |
| 36 | + } |
| 37 | + |
| 38 | + private final EmbeddingStore<TextSegment> embeddingStore; |
| 39 | + private final EmbeddingModel embeddingModel; |
| 40 | + |
| 41 | + public EmbeddingChartProcessor(EmbeddingStore<TextSegment> embeddingStore, EmbeddingModel embeddingModel) { |
| 42 | + this.embeddingStore = embeddingStore; |
| 43 | + this.embeddingModel = embeddingModel; |
| 44 | + } |
| 45 | + |
| 46 | + @Override |
| 47 | + public String getProcessorName() { |
| 48 | + return "Embedding Chart Processor"; |
| 49 | + } |
| 50 | + |
| 51 | + @Override |
| 52 | + public void process(ChartingContext chartingContext) { |
| 53 | + List<EmbeddingData> embeddingBatch = new ArrayList<>(); |
| 54 | + |
| 55 | + chartingContext.chart().scopes().stream() |
| 56 | + .map(scope -> chartingContext.chart().navigationCalculator().kindOf(scope.id().nid())) |
| 57 | + .flatMap(intIdSet -> intIdSet.intStream().boxed()) |
| 58 | + .forEach(nid -> { |
| 59 | + EmbeddingData embeddingData = generateEmbeddingName(nid, chartingContext.chart().constellationId(), |
| 60 | + chartingContext); |
| 61 | + embeddingBatch.add(embeddingData); |
| 62 | + if (embeddingBatch.size() == chartingContext.batchSize()) { // Process in batches of 100 |
| 63 | + processBatch(embeddingBatch, chartingContext); |
| 64 | + embeddingBatch.clear(); |
| 65 | + } |
| 66 | + }); |
| 67 | + |
| 68 | + // Process any remaining items in the batch |
| 69 | + if (!embeddingBatch.isEmpty()) { |
| 70 | + processBatch(embeddingBatch, chartingContext); |
| 71 | + } |
| 72 | + } |
| 73 | + |
| 74 | + private EmbeddingData generateEmbeddingName(int nid, UUID constelationId, ChartingContext chartingContext) { |
| 75 | + // Generate a descriptive name for the embedding based on the concept's |
| 76 | + // attributes |
| 77 | + StringBuilder nameBuilder = new StringBuilder(); |
| 78 | + nameBuilder.append("Concept: " + generateConcept(nid, chartingContext)).append(". "); |
| 79 | + nameBuilder.append("Category: " + generateCategory(nid, chartingContext)).append(". "); |
| 80 | + nameBuilder.append("Synonyms: " + generateSynonyms(nid, chartingContext)).append(". "); |
| 81 | + nameBuilder.append("Description: " + generateDescription(nid, chartingContext)).append(". "); |
| 82 | + nameBuilder.append("Available Semantic Features: " + generateSemanticFeatures(nid, chartingContext)) |
| 83 | + .append(". "); |
| 84 | + // Create metadata for the embedding - this will help to filter based on |
| 85 | + // constellation used in prompts |
| 86 | + Metadata metadata = Metadata.from(Map.of( |
| 87 | + "id", nid, |
| 88 | + "constellationId", constelationId.toString())); |
| 89 | + return new EmbeddingData(TextSegment.from(nameBuilder.toString()), metadata); |
| 90 | + } |
| 91 | + |
| 92 | + private String generateConcept(int nid, ChartingContext chartingContext) { |
| 93 | + return chartingContext.chart().languageCalculator().getFullyQualifiedDescriptionTextWithFallbackOrNid(nid); |
| 94 | + } |
| 95 | + |
| 96 | + private String generateCategory(int nid, ChartingContext chartingContext) { |
| 97 | + StringBuilder categoryBuilder = new StringBuilder(); |
| 98 | + IntIdSet intIdSet = chartingContext.chart().navigationCalculator().ancestorsOf(nid); |
| 99 | + int[] nids = intIdSet.toArray(); |
| 100 | + |
| 101 | + for (int i = 0; i < nids.length && i < 4; i++) { |
| 102 | + if (i != TinkarTermV2.INTEGRATED_KNOWLEDGE_MANAGEMENT.nid()) { |
| 103 | + categoryBuilder.append( |
| 104 | + chartingContext.chart().languageCalculator().getDescriptionTextOrNid(nids[i])) |
| 105 | + .append(", "); |
| 106 | + } |
| 107 | + } |
| 108 | + |
| 109 | + return categoryBuilder.toString().substring(0, categoryBuilder.length() - 2); // Remove trailing ", " |
| 110 | + } |
| 111 | + |
| 112 | + private String generateSynonyms(int nid, ChartingContext chartingContext) { |
| 113 | + Optional<String> regularName = chartingContext.chart().languageCalculator().getRegularDescriptionText(nid); |
| 114 | + if (regularName.isPresent()) { |
| 115 | + return regularName.get(); |
| 116 | + } |
| 117 | + return ""; |
| 118 | + } |
| 119 | + |
| 120 | + private String generateDescription(int nid, ChartingContext chartingContext) { |
| 121 | + Optional<String> descriptionName = chartingContext.chart().languageCalculator().getDescriptionText(nid); |
| 122 | + if (descriptionName.isPresent()) { |
| 123 | + return descriptionName.get(); |
| 124 | + } |
| 125 | + return ""; |
| 126 | + } |
| 127 | + |
| 128 | + private String generateSemanticFeatures(int nid, ChartingContext chartingContext) { |
| 129 | + StringBuilder semanticFeaturesBuilder = new StringBuilder(); |
| 130 | + PrimitiveData.get().forEachSemanticNidForComponent(nid, semanticNid -> { |
| 131 | + Latest<SemanticEntityVersion> latestSemanticEntityVersion = chartingContext.chart().stampCalculator() |
| 132 | + .latest(semanticNid); |
| 133 | + if (latestSemanticEntityVersion.isPresent()) { |
| 134 | + SemanticEntityVersion semanticEntityVersion = latestSemanticEntityVersion.get(); |
| 135 | + Latest<PatternEntityVersion> latestPatternEntityVersion = chartingContext.chart().stampCalculator() |
| 136 | + .latest(semanticEntityVersion.patternNid()); |
| 137 | + if (latestPatternEntityVersion.isPresent()) { |
| 138 | + PatternEntityVersion patternEntityVersion = latestPatternEntityVersion.get(); |
| 139 | + String semanticName = chartingContext.chart().languageCalculator() |
| 140 | + .getDescriptionTextOrNid(patternEntityVersion.nid()); |
| 141 | + String purpose = chartingContext.chart().languageCalculator() |
| 142 | + .getDescriptionTextOrNid(patternEntityVersion.semanticPurposeNid()); |
| 143 | + semanticFeaturesBuilder.append(semanticName + " (Provides " + purpose + "), "); |
| 144 | + } |
| 145 | + } |
| 146 | + }); |
| 147 | + return semanticFeaturesBuilder.toString().substring(0, semanticFeaturesBuilder.length() - 2); |
| 148 | + } |
42 | 149 |
|
| 150 | + private void processBatch(List<EmbeddingData> embeddingBatch, ChartingContext chartingContext) { |
| 151 | + // Generate embeddings values from string names |
| 152 | + List<TextSegment> names = embeddingBatch.stream().map(EmbeddingData::name).toList(); |
| 153 | + List<Embedding> embeddings = embeddingModel.embedAll(names).content(); |
| 154 | + // Create Segments for vector store with embedding values and metadata |
| 155 | + List<TextSegment> segmentsWithMetaData = embeddingBatch.stream() |
| 156 | + .map(embeddingData -> TextSegment.from(embeddingData.name().text(), embeddingData.metadata())) |
| 157 | + .toList(); |
| 158 | + embeddingStore.addAll(embeddings, segmentsWithMetaData); |
| 159 | + chartingContext.progressUpdate().accept(chartingContext.batchSize()); |
| 160 | + } |
43 | 161 | } |
0 commit comments