mirror of
https://github.com/AIDotNet/AntSK.git
synced 2026-02-17 22:10:14 +08:00
add excel导入
This commit is contained in:
@@ -182,6 +182,12 @@
|
||||
模型写死
|
||||
</summary>
|
||||
</member>
|
||||
<member name="P:AntSK.Domain.Domain.Other.KMExcelHandler.StepName">
|
||||
<inheritdoc />
|
||||
</member>
|
||||
<member name="M:AntSK.Domain.Domain.Other.KMExcelHandler.InvokeAsync(Microsoft.KernelMemory.Pipeline.DataPipeline,System.Threading.CancellationToken)">
|
||||
<inheritdoc />
|
||||
</member>
|
||||
<member name="F:AntSK.Domain.Domain.Other.LLamaConfig.dicLLamaWeights">
|
||||
<summary>
|
||||
避免模型重复加载,本地缓存
|
||||
|
||||
@@ -28,5 +28,7 @@ namespace AntSK.Domain.Domain.Model.Constant
|
||||
历史聊天记录:{{ConversationSummaryPlugin.SummarizeConversation $history}}
|
||||
--------------------------
|
||||
用户问题: {{$input}}";
|
||||
|
||||
/// <summary>
/// Delimiter written after each Excel question/answer record when the records are
/// concatenated for import; KMExcelHandler splits the extracted text on this marker
/// so that every record becomes its own partition.
/// </summary>
public const string KMExcelSplit = "*&antsk_excel&*";
|
||||
}
|
||||
}
|
||||
|
||||
156
src/AntSK.Domain/Domain/Other/KMExcelHandler.cs
Normal file
156
src/AntSK.Domain/Domain/Other/KMExcelHandler.cs
Normal file
@@ -0,0 +1,156 @@
|
||||
using AntSK.Domain.Domain.Model.Constant;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.KernelMemory.AI.OpenAI;
|
||||
using Microsoft.KernelMemory.Configuration;
|
||||
using Microsoft.KernelMemory.DataFormats.Text;
|
||||
using Microsoft.KernelMemory.Diagnostics;
|
||||
using Microsoft.KernelMemory.Extensions;
|
||||
using Microsoft.KernelMemory.Pipeline;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace AntSK.Domain.Domain.Other
|
||||
{
|
||||
/// <summary>
/// Kernel Memory pipeline step that partitions extracted text into one partition per
/// Excel record. Records are delimited by <see cref="KmsConstantcs.KMExcelSplit"/>
/// rather than being chunked by token count.
/// </summary>
public class KMExcelHandler : IPipelineStepHandler
{
    // Validated in the constructor; record-based splitting does not otherwise consult it.
    private readonly TextPartitioningOptions _options;
    private readonly IPipelineOrchestrator _orchestrator;
    private readonly ILogger<KMExcelHandler> _log;
    private readonly TextChunker.TokenCounter _tokenCounter;

    /// <summary>
    /// Creates the handler.
    /// </summary>
    /// <param name="stepName">Name of the pipeline step this handler is registered under.</param>
    /// <param name="orchestrator">Orchestrator used to read and write pipeline files.</param>
    /// <param name="options">Partitioning options; defaults are used (and validated) when null.</param>
    /// <param name="log">Logger; a default logger is used when null.</param>
    public KMExcelHandler(
        string stepName,
        IPipelineOrchestrator orchestrator,
        TextPartitioningOptions? options = null,
        ILogger<KMExcelHandler>? log = null)
    {
        this.StepName = stepName;
        this._orchestrator = orchestrator;
        this._options = options ?? new TextPartitioningOptions();
        this._options.Validate();

        this._log = log ?? DefaultLogger<KMExcelHandler>.Instance;
        this._tokenCounter = DefaultGPTTokenizer.StaticCountTokens;
    }

    /// <inheritdoc />
    public string StepName { get; }

    /// <summary>
    /// Splits every extracted-text artifact of the pipeline into per-record partitions,
    /// writes each partition through the orchestrator and registers it on the uploaded file.
    /// </summary>
    /// <param name="pipeline">Pipeline whose files are partitioned.</param>
    /// <param name="cancellationToken">Token forwarded to the orchestrator's read/write calls.</param>
    /// <returns>Always <c>(true, pipeline)</c>; files that cannot be partitioned are skipped.</returns>
    public async Task<(bool success, DataPipeline updatedPipeline)> InvokeAsync(
        DataPipeline pipeline, CancellationToken cancellationToken = default)
    {
        this._log.LogDebug("Partitioning text, pipeline '{0}/{1}'", pipeline.Index, pipeline.DocumentId);

        if (pipeline.Files.Count == 0)
        {
            this._log.LogWarning("Pipeline '{0}/{1}': there are no files to process, moving to next pipeline step.", pipeline.Index, pipeline.DocumentId);
            return (true, pipeline);
        }

        foreach (DataPipeline.FileDetails uploadedFile in pipeline.Files)
        {
            // Track new files being generated (GeneratedFiles cannot be edited while it is being enumerated)
            Dictionary<string, DataPipeline.GeneratedFileDetails> newFiles = new();

            foreach (KeyValuePair<string, DataPipeline.GeneratedFileDetails> generatedFile in uploadedFile.GeneratedFiles)
            {
                var file = generatedFile.Value;
                if (file.AlreadyProcessedBy(this))
                {
                    this._log.LogTrace("File {0} already processed by this handler", file.Name);
                    continue;
                }

                // Partition only the original text
                if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText)
                {
                    this._log.LogTrace("Skipping file {0} (not original text)", file.Name);
                    continue;
                }

                BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);

                // Skip empty content. Also: partitionContent.ToString() throws an exception if there are no bytes.
                if (partitionContent.ToArray().Length == 0) { continue; }

                List<string> partitions;
                switch (file.MimeType)
                {
                    // Plain text and markdown are split identically: one partition per record.
                    case MimeTypes.PlainText:
                    case MimeTypes.MarkDown:
                        this._log.LogDebug("Partitioning text file {0}", file.Name);
                        partitions = SplitRecords(partitionContent.ToString());
                        break;

                    default:
                        this._log.LogWarning("File {0} cannot be partitioned, type '{1}' not supported", file.Name, file.MimeType);
                        // Don't partition other files
                        continue;
                }

                if (partitions.Count == 0) { continue; }

                this._log.LogDebug("Saving {0} file partitions", partitions.Count);
                for (int partitionNumber = 0; partitionNumber < partitions.Count; partitionNumber++)
                {
                    // TODO: turn partitions in objects with more details, e.g. page number
                    string text = partitions[partitionNumber];
                    int sectionNumber = 0; // TODO: use this to store the page number (if any)
                    BinaryData textData = new(text);

                    int tokenCount = this._tokenCounter(text);
                    this._log.LogDebug("Partition size: {0} tokens", tokenCount);

                    var destFile = uploadedFile.GetPartitionFileName(partitionNumber);
                    await this._orchestrator.WriteFileAsync(pipeline, destFile, textData, cancellationToken).ConfigureAwait(false);

                    var destFileDetails = new DataPipeline.GeneratedFileDetails
                    {
                        Id = Guid.NewGuid().ToString("N"),
                        ParentId = uploadedFile.Id,
                        Name = destFile,
                        Size = text.Length,
                        MimeType = MimeTypes.PlainText,
                        ArtifactType = DataPipeline.ArtifactTypes.TextPartition,
                        PartitionNumber = partitionNumber,
                        SectionNumber = sectionNumber,
                        Tags = pipeline.Tags,
                        ContentSHA256 = textData.CalculateSHA256(),
                    };
                    newFiles.Add(destFile, destFileDetails);
                    destFileDetails.MarkProcessedBy(this);
                }

                file.MarkProcessedBy(this);
            }

            // Add new files to pipeline status
            foreach (var file in newFiles)
            {
                uploadedFile.GeneratedFiles.Add(file.Key, file.Value);
            }
        }

        return (true, pipeline);
    }

    /// <summary>
    /// Splits <paramref name="content"/> on <see cref="KmsConstantcs.KMExcelSplit"/> into records.
    /// Each record is trimmed and whitespace-only records are dropped, so line breaks that
    /// surround the delimiter do not yield blank partitions or leak into neighbouring records.
    /// </summary>
    private static List<string> SplitRecords(string content)
    {
        return content
            .Split(KmsConstantcs.KMExcelSplit, StringSplitOptions.RemoveEmptyEntries)
            .Select(record => record.Trim())
            .Where(record => record.Length > 0)
            .ToList();
    }
}
|
||||
}
|
||||
@@ -3,8 +3,11 @@ using AntSK.Domain.Domain.Interface;
|
||||
using AntSK.Domain.Domain.Model;
|
||||
using AntSK.Domain.Domain.Model.Constant;
|
||||
using AntSK.Domain.Domain.Model.Excel;
|
||||
using AntSK.Domain.Domain.Other;
|
||||
using AntSK.Domain.Repositories;
|
||||
using Microsoft.KernelMemory;
|
||||
using Microsoft.KernelMemory.Handlers;
|
||||
using System.Text;
|
||||
|
||||
namespace AntSK.Domain.Domain.Service
|
||||
{
|
||||
@@ -68,14 +71,28 @@ namespace AntSK.Domain.Domain.Service
|
||||
case ImportType.Excel:
|
||||
using (var fs = File.OpenRead(req.FilePath))
|
||||
{
|
||||
var excelList= ExeclHelper.ExcelToList<KMSExcelModel>(fs);
|
||||
var excelList= ExeclHelper.ExcelToList<KMSExcelModel>(fs);
|
||||
|
||||
_memory.Orchestrator.AddHandler<TextExtractionHandler>("extract_text");
|
||||
_memory.Orchestrator.AddHandler<KMExcelHandler>("antsk_excel_split");
|
||||
_memory.Orchestrator.AddHandler<GenerateEmbeddingsHandler>("generate_embeddings");
|
||||
_memory.Orchestrator.AddHandler<SaveRecordsHandler>("save_memory_records");
|
||||
|
||||
StringBuilder text = new StringBuilder();
|
||||
foreach (var item in excelList)
|
||||
{
|
||||
var text = @$"Question:{item.Question}{Environment.NewLine} Answer:{item.Answer}";
|
||||
var importResult = _memory.ImportTextAsync(text, fileid, new TagCollection() { { KmsConstantcs.KmsIdTag, req.KmsId } }
|
||||
, index: KmsConstantcs.KmsIndex).Result;
|
||||
text.AppendLine(@$"Question:{item.Question}{Environment.NewLine}Answer:{item.Answer}{KmsConstantcs.KMExcelSplit}");
|
||||
}
|
||||
var testList = _kMService.GetDocumentByFileID(km.Id, fileid).Result;
|
||||
var importResult = _memory.ImportTextAsync(text.ToString(), fileid, new TagCollection() { { KmsConstantcs.KmsIdTag, req.KmsId } }
|
||||
, index: KmsConstantcs.KmsIndex,
|
||||
steps: new[]
|
||||
{
|
||||
"extract_text",
|
||||
"antsk_excel_split",
|
||||
"generate_embeddings",
|
||||
"save_memory_records"
|
||||
}
|
||||
).Result;
|
||||
req.KmsDetail.FileName = req.FileName;
|
||||
string fileGuidName = Path.GetFileName(req.FilePath);
|
||||
req.KmsDetail.FileGuidName = fileGuidName;
|
||||
|
||||
Reference in New Issue
Block a user