add excel导入

This commit is contained in:
zeyu xu
2024-04-05 20:23:03 +08:00
parent 9f33b5009b
commit 8a0609e970
4 changed files with 186 additions and 5 deletions

View File

@@ -182,6 +182,12 @@
模型写死
</summary>
</member>
<member name="P:AntSK.Domain.Domain.Other.KMExcelHandler.StepName">
<inheritdoc />
</member>
<member name="M:AntSK.Domain.Domain.Other.KMExcelHandler.InvokeAsync(Microsoft.KernelMemory.Pipeline.DataPipeline,System.Threading.CancellationToken)">
<inheritdoc />
</member>
<member name="F:AntSK.Domain.Domain.Other.LLamaConfig.dicLLamaWeights">
<summary>
避免模型重复加载,本地缓存

View File

@@ -28,5 +28,7 @@ namespace AntSK.Domain.Domain.Model.Constant
历史聊天记录:{{ConversationSummaryPlugin.SummarizeConversation $history}}
--------------------------
用户问题: {{$input}}";
public const string KMExcelSplit = "*&antsk_excel&*";
}
}

View File

@@ -0,0 +1,156 @@
using AntSK.Domain.Domain.Model.Constant;
using Microsoft.Extensions.Logging;
using Microsoft.KernelMemory.AI.OpenAI;
using Microsoft.KernelMemory.Configuration;
using Microsoft.KernelMemory.DataFormats.Text;
using Microsoft.KernelMemory.Diagnostics;
using Microsoft.KernelMemory.Extensions;
using Microsoft.KernelMemory.Pipeline;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace AntSK.Domain.Domain.Other
{
public class KMExcelHandler: IPipelineStepHandler
{
private readonly TextPartitioningOptions _options;
private readonly IPipelineOrchestrator _orchestrator;
private readonly ILogger<KMExcelHandler> _log;
private readonly TextChunker.TokenCounter _tokenCounter;
public KMExcelHandler(
string stepName,
IPipelineOrchestrator orchestrator,
TextPartitioningOptions? options = null,
ILogger<KMExcelHandler>? log = null)
{
this.StepName = stepName;
this._orchestrator = orchestrator;
this._options = options ?? new TextPartitioningOptions();
this._options.Validate();
this._log = log ?? DefaultLogger<KMExcelHandler>.Instance;
this._tokenCounter = DefaultGPTTokenizer.StaticCountTokens;
}
/// <inheritdoc />
public string StepName { get; }
/// <inheritdoc />
public async Task<(bool success, DataPipeline updatedPipeline)> InvokeAsync(
DataPipeline pipeline, CancellationToken cancellationToken = default)
{
this._log.LogDebug("Partitioning text, pipeline '{0}/{1}'", pipeline.Index, pipeline.DocumentId);
if (pipeline.Files.Count == 0)
{
this._log.LogWarning("Pipeline '{0}/{1}': there are no files to process, moving to next pipeline step.", pipeline.Index, pipeline.DocumentId);
return (true, pipeline);
}
foreach (DataPipeline.FileDetails uploadedFile in pipeline.Files)
{
// Track new files being generated (cannot edit originalFile.GeneratedFiles while looping it)
Dictionary<string, DataPipeline.GeneratedFileDetails> newFiles = new();
foreach (KeyValuePair<string, DataPipeline.GeneratedFileDetails> generatedFile in uploadedFile.GeneratedFiles)
{
var file = generatedFile.Value;
if (file.AlreadyProcessedBy(this))
{
this._log.LogTrace("File {0} already processed by this handler", file.Name);
continue;
}
// Partition only the original text
if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText)
{
this._log.LogTrace("Skipping file {0} (not original text)", file.Name);
continue;
}
// Use a different partitioning strategy depending on the file type
List<string> partitions;
List<string> sentences;
BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
// Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes.
if (partitionContent.ToArray().Length == 0) { continue; }
switch (file.MimeType)
{
case MimeTypes.PlainText:
{
this._log.LogDebug("Partitioning text file {0}", file.Name);
string content = partitionContent.ToString();
var excelList = content.Split(KmsConstantcs.KMExcelSplit, StringSplitOptions.RemoveEmptyEntries).ToList();
sentences = excelList;
partitions = excelList;
break;
}
case MimeTypes.MarkDown:
{
this._log.LogDebug("Partitioning text file {0}", file.Name);
string content = partitionContent.ToString();
var excelList = content.Split(KmsConstantcs.KMExcelSplit, StringSplitOptions.RemoveEmptyEntries).ToList();
sentences = excelList;
partitions = excelList;
break;
}
default:
this._log.LogWarning("File {0} cannot be partitioned, type '{1}' not supported", file.Name, file.MimeType);
// Don't partition other files
continue;
}
if (partitions.Count == 0) { continue; }
this._log.LogDebug("Saving {0} file partitions", partitions.Count);
for (int partitionNumber = 0; partitionNumber < partitions.Count; partitionNumber++)
{
// TODO: turn partitions in objects with more details, e.g. page number
string text = partitions[partitionNumber];
int sectionNumber = 0; // TODO: use this to store the page number (if any)
BinaryData textData = new(text);
int tokenCount = this._tokenCounter(text);
this._log.LogDebug("Partition size: {0} tokens", tokenCount);
var destFile = uploadedFile.GetPartitionFileName(partitionNumber);
await this._orchestrator.WriteFileAsync(pipeline, destFile, textData, cancellationToken).ConfigureAwait(false);
var destFileDetails = new DataPipeline.GeneratedFileDetails
{
Id = Guid.NewGuid().ToString("N"),
ParentId = uploadedFile.Id,
Name = destFile,
Size = text.Length,
MimeType = MimeTypes.PlainText,
ArtifactType = DataPipeline.ArtifactTypes.TextPartition,
PartitionNumber = partitionNumber,
SectionNumber = sectionNumber,
Tags = pipeline.Tags,
ContentSHA256 = textData.CalculateSHA256(),
};
newFiles.Add(destFile, destFileDetails);
destFileDetails.MarkProcessedBy(this);
}
file.MarkProcessedBy(this);
}
// Add new files to pipeline status
foreach (var file in newFiles)
{
uploadedFile.GeneratedFiles.Add(file.Key, file.Value);
}
}
return (true, pipeline);
}
}
}

View File

@@ -3,8 +3,11 @@ using AntSK.Domain.Domain.Interface;
using AntSK.Domain.Domain.Model;
using AntSK.Domain.Domain.Model.Constant;
using AntSK.Domain.Domain.Model.Excel;
using AntSK.Domain.Domain.Other;
using AntSK.Domain.Repositories;
using Microsoft.KernelMemory;
using Microsoft.KernelMemory.Handlers;
using System.Text;
namespace AntSK.Domain.Domain.Service
{
@@ -68,14 +71,28 @@ namespace AntSK.Domain.Domain.Service
case ImportType.Excel:
using (var fs = File.OpenRead(req.FilePath))
{
var excelList= ExeclHelper.ExcelToList<KMSExcelModel>(fs);
var excelList= ExeclHelper.ExcelToList<KMSExcelModel>(fs);
_memory.Orchestrator.AddHandler<TextExtractionHandler>("extract_text");
_memory.Orchestrator.AddHandler<KMExcelHandler>("antsk_excel_split");
_memory.Orchestrator.AddHandler<GenerateEmbeddingsHandler>("generate_embeddings");
_memory.Orchestrator.AddHandler<SaveRecordsHandler>("save_memory_records");
StringBuilder text = new StringBuilder();
foreach (var item in excelList)
{
var text = @$"Question:{item.Question}{Environment.NewLine} Answer:{item.Answer}";
var importResult = _memory.ImportTextAsync(text, fileid, new TagCollection() { { KmsConstantcs.KmsIdTag, req.KmsId } }
, index: KmsConstantcs.KmsIndex).Result;
text.AppendLine(@$"Question:{item.Question}{Environment.NewLine}Answer:{item.Answer}{KmsConstantcs.KMExcelSplit}");
}
var testList = _kMService.GetDocumentByFileID(km.Id, fileid).Result;
var importResult = _memory.ImportTextAsync(text.ToString(), fileid, new TagCollection() { { KmsConstantcs.KmsIdTag, req.KmsId } }
, index: KmsConstantcs.KmsIndex,
steps: new[]
{
"extract_text",
"antsk_excel_split",
"generate_embeddings",
"save_memory_records"
}
).Result;
req.KmsDetail.FileName = req.FileName;
string fileGuidName = Path.GetFileName(req.FilePath);
req.KmsDetail.FileGuidName = fileGuidName;