add excel导入

2026-02-17 22:10:14 +08:00 · 2024-04-05 20:23:03 +08:00
parent 9f33b5009b
commit 8a0609e970
4 changed files with 186 additions and 5 deletions
--- a/src/AntSK.Domain/AntSK.Domain.xml
+++ b/src/AntSK.Domain/AntSK.Domain.xml
@@ -182,6 +182,12 @@
            模型写死
            </summary>
        </member>
+        <member name="P:AntSK.Domain.Domain.Other.KMExcelHandler.StepName">
+            <inheritdoc />
+        </member>
+        <member name="M:AntSK.Domain.Domain.Other.KMExcelHandler.InvokeAsync(Microsoft.KernelMemory.Pipeline.DataPipeline,System.Threading.CancellationToken)">
+            <inheritdoc />
+        </member>
        <member name="F:AntSK.Domain.Domain.Other.LLamaConfig.dicLLamaWeights">
            <summary>
            避免模型重复加载，本地缓存
--- a/src/AntSK.Domain/Domain/Model/Constant/KmsConstantcs.cs
+++ b/src/AntSK.Domain/Domain/Model/Constant/KmsConstantcs.cs
@@ -28,5 +28,7 @@ namespace AntSK.Domain.Domain.Model.Constant
 历史聊天记录:{{ConversationSummaryPlugin.SummarizeConversation $history}}
 --------------------------
 用户问题: {{$input}}";
+
+        public const string KMExcelSplit = "*&antsk_excel&*";
    }
 }
--- a/src/AntSK.Domain/Domain/Other/KMExcelHandler.cs
+++ b/src/AntSK.Domain/Domain/Other/KMExcelHandler.cs
@@ -0,0 +1,156 @@
+using AntSK.Domain.Domain.Model.Constant;
+using Microsoft.Extensions.Logging;
+using Microsoft.KernelMemory.AI.OpenAI;
+using Microsoft.KernelMemory.Configuration;
+using Microsoft.KernelMemory.DataFormats.Text;
+using Microsoft.KernelMemory.Diagnostics;
+using Microsoft.KernelMemory.Extensions;
+using Microsoft.KernelMemory.Pipeline;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace AntSK.Domain.Domain.Other
+{
+    /// <summary>
+    /// Pipeline step handler that partitions extracted text by splitting it on the
+    /// <see cref="KmsConstantcs.KMExcelSplit"/> marker, so every delimited segment is
+    /// written and registered as its own text partition.
+    /// </summary>
+    public class KMExcelHandler: IPipelineStepHandler
+    {
+        private readonly TextPartitioningOptions _options;
+        private readonly IPipelineOrchestrator _orchestrator;
+        private readonly ILogger<KMExcelHandler> _log;
+        private readonly TextChunker.TokenCounter _tokenCounter;
+
+        /// <summary>Creates the handler and validates the partitioning options.</summary>
+        /// <param name="stepName">Name exposed through <see cref="StepName"/>.</param>
+        /// <param name="orchestrator">Orchestrator used to read and write pipeline files.</param>
+        /// <param name="options">Partitioning options; a default instance is created when null.</param>
+        /// <param name="log">Logger; the default logger instance is used when null.</param>
+        public KMExcelHandler(
+            string stepName,
+            IPipelineOrchestrator orchestrator,
+            TextPartitioningOptions? options = null,
+            ILogger<KMExcelHandler>? log = null)
+        {
+            this.StepName = stepName;
+            this._orchestrator = orchestrator;
+            this._options = options ?? new TextPartitioningOptions();
+            this._options.Validate();
+
+            this._log = log ?? DefaultLogger<KMExcelHandler>.Instance;
+            this._tokenCounter = DefaultGPTTokenizer.StaticCountTokens;
+        }
+
+        /// <inheritdoc />
+        public string StepName { get; }
+
+        /// <inheritdoc />
+        public async Task<(bool success, DataPipeline updatedPipeline)> InvokeAsync(
+            DataPipeline pipeline, CancellationToken cancellationToken = default)
+        {
+            this._log.LogDebug("Partitioning text, pipeline '{0}/{1}'", pipeline.Index, pipeline.DocumentId);
+
+            if (pipeline.Files.Count == 0)
+            {
+                this._log.LogWarning("Pipeline '{0}/{1}': there are no files to process, moving to next pipeline step.", pipeline.Index, pipeline.DocumentId);
+                return (true, pipeline);
+            }
+
+            foreach (DataPipeline.FileDetails uploadedFile in pipeline.Files)
+            {
+                // Track new files being generated (cannot edit uploadedFile.GeneratedFiles while looping it)
+                Dictionary<string, DataPipeline.GeneratedFileDetails> newFiles = new();
+
+                foreach (KeyValuePair<string, DataPipeline.GeneratedFileDetails> generatedFile in uploadedFile.GeneratedFiles)
+                {
+                    var file = generatedFile.Value;
+                    if (file.AlreadyProcessedBy(this))
+                    {
+                        this._log.LogTrace("File {0} already processed by this handler", file.Name);
+                        continue;
+                    }
+
+                    // Partition only the original text
+                    if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText)
+                    {
+                        this._log.LogTrace("Skipping file {0} (not original text)", file.Name);
+                        continue;
+                    }
+
+                    // Plain text and Markdown share one strategy: split on the Excel row marker
+                    List<string> partitions;
+                    BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
+
+                    // Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes.
+                    if (partitionContent.ToArray().Length == 0) { continue; }
+
+                    switch (file.MimeType)
+                    {
+                        case MimeTypes.PlainText:
+                        case MimeTypes.MarkDown:
+                            {
+                                this._log.LogDebug("Partitioning text file {0}", file.Name);
+                                string content = partitionContent.ToString();
+                                // TrimEntries strips the line break written after each marker and, combined with
+                                // RemoveEmptyEntries, drops whitespace-only segments such as the one after the last marker.
+                                partitions = content.Split(KmsConstantcs.KMExcelSplit, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
+                                break;
+                            }
+                        default:
+                            this._log.LogWarning("File {0} cannot be partitioned, type '{1}' not supported", file.Name, file.MimeType);
+                            // Don't partition other files
+                            continue;
+                    }
+
+                    if (partitions.Count == 0) { continue; }
+
+                    this._log.LogDebug("Saving {0} file partitions", partitions.Count);
+                    for (int partitionNumber = 0; partitionNumber < partitions.Count; partitionNumber++)
+                    {
+                        // TODO: turn partitions in objects with more details, e.g. page number
+                        string text = partitions[partitionNumber];
+                        int sectionNumber = 0; // TODO: use this to store the page number (if any)
+                        BinaryData textData = new(text);
+
+                        int tokenCount = this._tokenCounter(text);
+                        this._log.LogDebug("Partition size: {0} tokens", tokenCount);
+
+                        var destFile = uploadedFile.GetPartitionFileName(partitionNumber);
+                        await this._orchestrator.WriteFileAsync(pipeline, destFile, textData, cancellationToken).ConfigureAwait(false);
+
+                        var destFileDetails = new DataPipeline.GeneratedFileDetails
+                        {
+                            Id = Guid.NewGuid().ToString("N"),
+                            ParentId = uploadedFile.Id,
+                            Name = destFile,
+                            Size = text.Length, // NOTE(review): character count, not encoded byte length; confirm intended
+                            MimeType = MimeTypes.PlainText,
+                            ArtifactType = DataPipeline.ArtifactTypes.TextPartition,
+                            PartitionNumber = partitionNumber,
+                            SectionNumber = sectionNumber,
+                            Tags = pipeline.Tags,
+                            ContentSHA256 = textData.CalculateSHA256(),
+                        };
+                        newFiles.Add(destFile, destFileDetails);
+                        destFileDetails.MarkProcessedBy(this);
+                    }
+
+                    file.MarkProcessedBy(this);
+                }
+
+                // Add new files to pipeline status
+                foreach (var file in newFiles)
+                {
+                    uploadedFile.GeneratedFiles.Add(file.Key, file.Value);
+                }
+            }
+
+            return (true, pipeline);
+        }
+    }
+}
--- a/src/AntSK.Domain/Domain/Service/ImportKMSService.cs
+++ b/src/AntSK.Domain/Domain/Service/ImportKMSService.cs
@@ -3,8 +3,11 @@ using AntSK.Domain.Domain.Interface;
 using AntSK.Domain.Domain.Model;
 using AntSK.Domain.Domain.Model.Constant;
 using AntSK.Domain.Domain.Model.Excel;
+using AntSK.Domain.Domain.Other;
 using AntSK.Domain.Repositories;
 using Microsoft.KernelMemory;
+using Microsoft.KernelMemory.Handlers;
+using System.Text;

 namespace AntSK.Domain.Domain.Service
 {
@@ -68,14 +71,28 @@ namespace AntSK.Domain.Domain.Service
                    case ImportType.Excel:
                        using (var fs = File.OpenRead(req.FilePath))
                        {
-                           var excelList= ExeclHelper.ExcelToList<KMSExcelModel>(fs);
+                            var excelList= ExeclHelper.ExcelToList<KMSExcelModel>(fs);
+                            
+                            _memory.Orchestrator.AddHandler<TextExtractionHandler>("extract_text");
+                            _memory.Orchestrator.AddHandler<KMExcelHandler>("antsk_excel_split");
+                            _memory.Orchestrator.AddHandler<GenerateEmbeddingsHandler>("generate_embeddings");
+                            _memory.Orchestrator.AddHandler<SaveRecordsHandler>("save_memory_records");
+
+                            StringBuilder text = new StringBuilder();
                            foreach (var item in excelList)
                            {
-                                var text = @$"Question:{item.Question}{Environment.NewLine} Answer:{item.Answer}";
-                                var importResult = _memory.ImportTextAsync(text, fileid, new TagCollection() { { KmsConstantcs.KmsIdTag, req.KmsId } }
-                                    , index: KmsConstantcs.KmsIndex).Result;
+                                text.AppendLine(@$"Question:{item.Question}{Environment.NewLine}Answer:{item.Answer}{KmsConstantcs.KMExcelSplit}");                            
                            }
-                            var testList = _kMService.GetDocumentByFileID(km.Id, fileid).Result;
+                            var importResult = _memory.ImportTextAsync(text.ToString(), fileid, new TagCollection() { { KmsConstantcs.KmsIdTag, req.KmsId } }
+                                  , index: KmsConstantcs.KmsIndex,
+                                  steps: new[]
+                                  {
+                                        "extract_text",
+                                        "antsk_excel_split",
+                                        "generate_embeddings",
+                                        "save_memory_records"
+                                  }
+                                  ).Result;
                            req.KmsDetail.FileName = req.FileName;
                            string fileGuidName = Path.GetFileName(req.FilePath);
                            req.KmsDetail.FileGuidName = fileGuidName;