# RAG

## 1. easy rag

```java
@Test
void test03() {
    // 1. Create the model (BASE_MODEL, defined elsewhere, is passed to the builder below)

    // 2. Load the documents
    List<Document> documents = ClassPathDocumentLoader.loadDocuments("excel");
    // List<Document> documents = FileSystemDocumentLoader.loadDocuments("/home/langchain4j/documentation");

    // Load recursively
    // List<Document> documents = FileSystemDocumentLoader.loadDocumentsRecursively("/home/langchain4j/documentation");

    // Filter the files with a PathMatcher (a glob pattern here, not a regular expression)
    // PathMatcher pathMatcher = FileSystems.getDefault().getPathMatcher("glob:*.pdf");
    // List<Document> documents = FileSystemDocumentLoader.loadDocuments("/home/langchain4j/documentation", pathMatcher);

    // 3. Store the documents in the EmbeddingStore
    InMemoryEmbeddingStore<TextSegment> embeddingStore = new InMemoryEmbeddingStore<>();
    EmbeddingStoreIngestor.ingest(documents, embeddingStore);

    // 4. Wire the embeddingStore into AiServices
    Assistant assistant = AiServices.builder(Assistant.class)
            .chatModel(BASE_MODEL)
            .chatMemory(MessageWindowChatMemory.withMaxMessages(10))
            .contentRetriever(EmbeddingStoreContentRetriever.from(embeddingStore))
            .build();
    // String chat = assistant.chat();

    // 5. Ask the question
    String answer = assistant.chat("当前活动节点是处理市场与销售的活动给我推荐几个动词和名词");
    System.out.println(answer);
}
```

5 个步骤：

1. 创建模型
2. 加载文档
3. 把文档存储到 EmbeddingStore
4. 配置 embeddingStore 到 AiServices 里面
5. 执行问答

```java
public class EmbeddingStoreIngestor {

    private static final Logger log = LoggerFactory.getLogger(EmbeddingStoreIngestor.class);

    // Initialisation of these fields (constructor / builder) is not shown in this excerpt.
    private final DocumentTransformer documentTransformer;
    private final DocumentSplitter documentSplitter;
    private final TextSegmentTransformer textSegmentTransformer;
    private final EmbeddingModel embeddingModel;
    private final EmbeddingStore<TextSegment> embeddingStore;

    /**
     * Ingests the given documents into the embedding store.
     *
     * 1. Transform the documents with the DocumentTransformer (skipped when it is null):
     *    documents = documentTransformer.transformAll(documents);
     * 2. Split the documents into segments (when no splitter is set, each document becomes one segment):
     *    segments = documentSplitter.splitAll(documents);
     * 3. Transform the segments with the TextSegmentTransformer (skipped when it is null):
     *    segments = textSegmentTransformer.transformAll(segments);
     * 4. Convert the segments into a List<Embedding> with the embeddingModel:
     *    Response<List<Embedding>> embeddingsResponse = embeddingModel.embedAll(segments);
     * 5. Save them to the embeddingStore:
     *    embeddingStore.addAll(embeddingsResponse.content(), segments);
     *
     * @param documents the documents to ingest
     * @return an IngestionResult built from the token usage reported by the embedding call
     */
    public IngestionResult ingest(List<Document> documents) {

        log.debug("Starting to ingest {} documents", documents.size());

        if (documentTransformer != null) {
            documents = documentTransformer.transformAll(documents);
            log.debug("Documents were transformed into {} documents", documents.size());
        }

        List<TextSegment> segments;
        if (documentSplitter != null) {
            segments = documentSplitter.splitAll(documents);
            log.debug("Documents were split into {} text segments", segments.size());
        } else {
            // No splitter configured: map every document to a single text segment.
            segments = documents.stream()
                    .map(Document::toTextSegment)
                    .collect(toList());
        }

        if (textSegmentTransformer != null) {
            segments = textSegmentTransformer.transformAll(segments);
            log.debug("{} documents were transformed into {} text segments", documents.size(), segments.size());
        }

        log.debug("Starting to embed {} text segments", segments.size());
        Response<List<Embedding>> embeddingsResponse = embeddingModel.embedAll(segments);
        log.debug("Finished embedding {} text segments", segments.size());

        log.debug("Starting to store {} text segments into the embedding store", segments.size());
        embeddingStore.addAll(embeddingsResponse.content(), segments);
        log.debug("Finished storing {} text segments into the embedding store", segments.size());

        return new IngestionResult(embeddingsResponse.tokenUsage());
    }
}
```

DocumentSplitter 的本质是把面向人阅读的连续长文本，转换成面向向量检索的独立语义单元，这是连接文档存储和精准检索之间的桥梁。

- DocumentByParagraphSplitter
- DocumentByLineSplitter
- DocumentBySentenceSplitter
- DocumentByWordSplitter
- DocumentByCharacterSplitter
- DocumentByRegexSplitter
- Recursive: `DocumentSplitters.recursive(...)`

## Embedding

```java
/**
 * Embedding.dimension() returns the dimension of the embedding vector (its length)
 * CosineSimilarity.between(Embedding, Embedding) calculates the cosine similarity between 2 Embeddings
 * Embedding.normalize() normalizes the embedding vector (in place)
 *
 * EmbeddingModel.embed(String) embeds the given text
 * EmbeddingModel.embed(TextSegment) embeds the given TextSegment
 * EmbeddingModel.embedAll(List<TextSegment>) embeds all the given TextSegment
 */
@Test
void test01() {
    // NOTE(review): judging by its name this is an English model, while the inputs below are
    // Chinese — confirm whether a Chinese/multilingual embedding model should be used instead.
    EmbeddingModel embeddingModel = new BgeSmallEnV15QuantizedEmbeddingModel();

    Response<Embedding> embeddingResponse = embeddingModel.embed("男人");
    Embedding male = embeddingResponse.content();

    Response<Embedding> embeddingResponse2 = embeddingModel.embed("女人");
    Embedding female = embeddingResponse2.content();
    double between1 = CosineSimilarity.between(male, female);

    Response<Embedding> embeddingResponse3 = embeddingModel.embed("猿猴");
    Embedding monkey = embeddingResponse3.content();
    double between2 = CosineSimilarity.between(male, monkey);

    // Print both similarities so the two pairs can actually be compared.
    System.out.println(between1);
    System.out.println(between2);
}
```

## Advanced RAG

- QueryTransformer: 核心目的是通过改写、扩展或压缩查询来提高检索质量。
- QueryRouter: 决定将处理好的查询（Query）发送给哪一个或哪几个“内容检索器”，其核心目的是确保查询能准确到达相关的数据源，实现精准检索；
- ContentRetriever: 真正与数据源交互，根据查询去“找出”相关的原始内容。
- ContentAggregator: 将来自不同查询、不同检索器的所有“原材料”进行“清洗、融合和精选”。
- ContentInjector: 将最终聚合好的、高质量的内容，以特定的格式“注入”回用户的原始消息中。

核心的代码逻辑如下（DefaultRetrievalAugmentor）：

```java
/**
 * Augments the chat message of the request with retrieved contents.
 *
 * Flow: extract the query text from the user message, transform it into one or more queries,
 * retrieve contents for them (see process), aggregate the contents and inject them into the message.
 *
 * @param augmentationRequest carries the chat message and the metadata used to build the query
 * @return the augmented chat message together with the contents that were injected
 * @throws IllegalArgumentException if the chat message is not a UserMessage
 */
@Override
public AugmentationResult augment(AugmentationRequest augmentationRequest) {

    ChatMessage chatMessage = augmentationRequest.chatMessage();
    String queryText;
    if (chatMessage instanceof UserMessage userMessage) {
        queryText = userMessage.singleText();
    } else {
        throw new IllegalArgumentException("Unsupported message type: " + chatMessage.type());
    }
    Query originalQuery = Query.from(queryText, augmentationRequest.metadata());

    Collection<Query> queries = queryTransformer.transform(originalQuery);

    Map<Query, Collection<List<Content>>> queryToContents = process(queries);

    List<Content> contents = contentAggregator.aggregate(queryToContents);

    ChatMessage augmentedChatMessage = contentInjector.inject(contents, chatMessage);

    return AugmentationResult.builder()
            .chatMessage(augmentedChatMessage)
            .contents(contents)
            .build();
}

/**
 * Routes every query to its content retrievers and collects the retrieved contents per query.
 *
 * The single-query / single-retriever case is handled directly on the calling thread; a single
 * query with several retrievers goes through retrieveFromAll(...).join(); several queries are
 * routed asynchronously on the executor and joined at the end. The helpers retrieveFromAll and
 * join are not shown in this excerpt.
 *
 * @param queries the queries produced by the query transformer
 * @return a map from each query to the lists of contents retrieved for it; empty when there are
 *         no queries, or when the single query is routed to no retriever
 */
private Map<Query, Collection<List<Content>>> process(Collection<Query> queries) {
    if (queries.size() == 1) {
        Query query = queries.iterator().next();
        Collection<ContentRetriever> retrievers = queryRouter.route(query);
        if (retrievers.size() == 1) {
            ContentRetriever contentRetriever = retrievers.iterator().next();
            List<Content> contents = contentRetriever.retrieve(query);
            return singletonMap(query, singletonList(contents));
        } else if (retrievers.size() > 1) {
            Collection<List<Content>> contents = retrieveFromAll(retrievers, query).join();
            return singletonMap(query, contents);
        } else {
            return emptyMap();
        }
    } else if (queries.size() > 1) {
        Map<Query, CompletableFuture<Collection<List<Content>>>> queryToFutureContents = new ConcurrentHashMap<>();
        queries.forEach(query -> {
            CompletableFuture<Collection<List<Content>>> futureContents =
                    supplyAsync(() -> queryRouter.route(query), executor)
                            .thenCompose(retrievers -> retrieveFromAll(retrievers, query));
            queryToFutureContents.put(query, futureContents);
        });
        return join(queryToFutureContents);
    } else {
        return emptyMap();
    }
}
```