diff --git a/.gitignore b/.gitignore index 4dedb1d..5e630e5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,367 @@ -/.vscode -/bin +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Ww][Ii][Nn]32/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Oo]ut/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# ASP.NET Scaffolding +ScaffoldingReadMe.txt + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.ilk +*.meta +*.obj +*.iobj +*.pch +*.pdb +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*.json +coverage*.xml +coverage*.info + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. +!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) +*.vbw + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +# Fody - auto-generated XML schema +FodyWeavers.xsd +/PipelineBot/appsettings.json + /mystem.exe -/obj /Test \ No newline at end of file diff --git a/Examples/MyStemExampleConsole/MyStemExampleConsole.csproj b/Examples/MyStemExampleConsole/MyStemExampleConsole.csproj new file mode 100644 index 0000000..d8e51b4 --- /dev/null +++ b/Examples/MyStemExampleConsole/MyStemExampleConsole.csproj @@ -0,0 +1,24 @@ + + + + Exe + net8.0 + enable + enable + + + + + + + + + + + + + PreserveNewest + + + + diff --git a/Examples/MyStemExampleConsole/Program.cs b/Examples/MyStemExampleConsole/Program.cs new file mode 100644 index 0000000..d395703 --- /dev/null +++ b/Examples/MyStemExampleConsole/Program.cs @@ -0,0 +1,36 @@ +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using MyStemSharpness.Extensions; +using MyStemSharpness.Interfaces; +using System.Diagnostics; + + +var configuration = new ConfigurationBuilder() + .AddJsonFile("appsettings.json", optional: false) + .Build(); + +var services = new ServiceCollection(); +services.AddFastMyStem(configuration); + +var serviceProvider = services.BuildServiceProvider(); + + + +var stem = serviceProvider.GetRequiredService(); +IEnumerable inputs = ["Двигатель башни колонки", "!!!!", "Тестовъ три", "Тестовых восемь тысяч", "Где деньги Либовский?" ]; + +for (int i = 0; i < 100000; i++) +{ + inputs = inputs.Append(i.ToString()); +} + +Stopwatch stopwatch = Stopwatch.StartNew(); +foreach (var input in inputs) +{ + var result = stem.ParseAnalysis(input); + Console.WriteLine($"{input} -> {result}"); +} + +stopwatch.Stop(); +Console.WriteLine($"Time: {stopwatch.ElapsedMilliseconds} ms"); +Console.WriteLine($"Total memory: {Process.GetCurrentProcess().WorkingSet64 / 1024 / 1024} MB"); \ No newline at end of file diff --git a/Examples/MyStemExampleConsole/appsettings.json b/Examples/MyStemExampleConsole/appsettings.json new file mode 100644 index 0000000..e01cd9c --- /dev/null +++ b/Examples/MyStemExampleConsole/appsettings.json @@ -0,0 +1,11 @@ +{ + "MyStemOptions": { + "PathToMyStem": "mystem.exe", + "CopyInputToOutput": true, + "PrintGrammaticalInformation": true, + "Encoding": "utf-8", + "TimeoutMs": 100, + "PrintOnlyLemmasAndGrammemes": true, + "Format": "json" + } +} diff --git a/MyStemSharpness.csproj b/MyStemSharpness.csproj deleted file mode 100644 index 4f5dc22..0000000 --- a/MyStemSharpness.csproj +++ /dev/null @@ -1,23 +0,0 @@ - - - - Library - net6.0 - enable - enable - - - MyStemSharpness - 1.1.0 - paralax034 - MyStem from Yandex for C# - © paralax034 2025 - MIT - https://github.com/Scream034/MyStemSharpness - https://github.com/Scream034/MyStemSharpness.git - git - mystem;nlp;russian;linguistics;ml - true - - - \ No newline at end of file diff --git a/MyStemSharpness.sln b/MyStemSharpness.sln index b9b1acc..87e43eb 100644 --- a/MyStemSharpness.sln +++ b/MyStemSharpness.sln @@ -3,7 +3,11 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 17 VisualStudioVersion = 17.5.002.0 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "MyStemSharpness", "MyStemSharpness.csproj", "{E68C4B8E-9A9E-4085-8451-FC8FA967180C}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "MyStemSharpness", "MyStemSharpness\MyStemSharpness.csproj", "{E68C4B8E-9A9E-4085-8451-FC8FA967180C}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Examples", "Examples", "{36D591C7-65C7-A0D1-1CBC-10CDE441BDC8}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MyStemExampleConsole", "Examples\MyStemExampleConsole\MyStemExampleConsole.csproj", "{F46F8F01-CEAA-4794-9161-81B1667C6157}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -15,10 +19,17 @@ Global {E68C4B8E-9A9E-4085-8451-FC8FA967180C}.Debug|Any CPU.Build.0 = Debug|Any CPU {E68C4B8E-9A9E-4085-8451-FC8FA967180C}.Release|Any CPU.ActiveCfg = Release|Any CPU {E68C4B8E-9A9E-4085-8451-FC8FA967180C}.Release|Any CPU.Build.0 = Release|Any CPU + {F46F8F01-CEAA-4794-9161-81B1667C6157}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {F46F8F01-CEAA-4794-9161-81B1667C6157}.Debug|Any CPU.Build.0 = Debug|Any CPU + {F46F8F01-CEAA-4794-9161-81B1667C6157}.Release|Any CPU.ActiveCfg = Release|Any CPU + {F46F8F01-CEAA-4794-9161-81B1667C6157}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {F46F8F01-CEAA-4794-9161-81B1667C6157} = {36D591C7-65C7-A0D1-1CBC-10CDE441BDC8} + EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {0514970C-D294-48DF-94FD-1CBBF14594EA} EndGlobalSection diff --git a/MyStem/MyStemOptions.cs b/MyStemSharpness/Configuration/MyStemOptions.cs similarity index 83% rename from MyStem/MyStemOptions.cs rename to MyStemSharpness/Configuration/MyStemOptions.cs index 24b35ba..533eded 100644 --- a/MyStem/MyStemOptions.cs +++ b/MyStemSharpness/Configuration/MyStemOptions.cs @@ -1,4 +1,4 @@ -namespace MyStem; +namespace MyStemSharpness.Configuration; /// /// Represents the command-line options for the MyStem executable. @@ -8,7 +8,7 @@ public sealed class MyStemOptions /// /// The path to the MyStem executable. /// - public static string PathToMyStem { get; set; } = "mystem.exe"; + public string PathToMyStem { get; set; } = "mystem.exe"; /// /// Enables line-by-line mode; each word is printed on a new line. @@ -91,6 +91,32 @@ public sealed class MyStemOptions /// public bool PrintLemmaWeight { get; set; } + /// + /// The default timeout for reading data from the MyStem process. + /// + public int TimeoutMs { get; set; } = 50; + + /// + /// Factor used to estimate the initial total buffer size for reading the output. + /// + public float TotalBufferFactorSize { get; set; } = 3.5f; + + /// + /// Factor used to estimate the initial step buffer size for reading the output. + /// + public float StepBufferFactorSize { get; set; } = 2.5f; + + /// + /// The string that marks the end of the input for MyStem. + /// + public string EndString { get; set; } = "\nъъ"; + + /// + /// The string that is replaced with an empty string in the output. + /// + public string EndReplaceString { get; set; } = "ъъ??\r\n"; + + /// /// Gets the command-line arguments string based on the current options. /// diff --git a/MyStemSharpness/Extensions/DependencyInjection.cs b/MyStemSharpness/Extensions/DependencyInjection.cs new file mode 100644 index 0000000..610c683 --- /dev/null +++ b/MyStemSharpness/Extensions/DependencyInjection.cs @@ -0,0 +1,60 @@ +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Options; +using MyStemSharpness.Configuration; +using MyStemSharpness.Implementations; +using MyStemSharpness.Interfaces; + +namespace MyStemSharpness.Extensions; + +public static class DependencyInjection +{ + + public static IServiceCollection AddMyStemOptions( + this IServiceCollection services, IConfiguration configuration) + { + services.AddOptions() + .Bind(configuration.GetSection(nameof(MyStemOptions))); + return services; + } + + public static IServiceCollection AddMyStem( + this IServiceCollection services, IConfiguration configuration) + { + services.AddScoped(); + + services.AddMyStemOptions(configuration); + return services; + } + + public static IServiceCollection AddMyStem( + this IServiceCollection services, MyStemOptions options) + { + services.AddScoped(); + + services.AddSingleton(Options.Create(options)); + + return services; + } + + + public static IServiceCollection AddFastMyStem( + this IServiceCollection services, IConfiguration configuration) + { + services.AddScoped(); + + services.AddMyStemOptions(configuration); + return services; + } + + + public static IServiceCollection AddFastMyStem( + this IServiceCollection services, MyStemOptions options) + { + services.AddScoped(); + + services.AddSingleton(Options.Create(options)); + + return services; + } +} diff --git a/MyStemSharpness/Extensions/MyStemExtension.cs b/MyStemSharpness/Extensions/MyStemExtension.cs new file mode 100644 index 0000000..1bc4ca5 --- /dev/null +++ b/MyStemSharpness/Extensions/MyStemExtension.cs @@ -0,0 +1,36 @@ +using MyStemSharpness.Interfaces; +using MyStemSharpness.Models; +using System.Text.Json; + +namespace MyStemSharpness.Extensions; + +public static class MyStemExtension +{ + private static JsonSerializerOptions _jsonOptions = new JsonSerializerOptions + { + PropertyNamingPolicy = JsonNamingPolicy.CamelCase, + PropertyNameCaseInsensitive = true + }; +public static List ParseAnalysis(this IMyStem myStem, string text) + { + var end = myStem.Options.Value.EndString.Trim(); + + for (var i = 0; i < 3; i++) + { + try + { + var json = myStem.Analysis(text); + var lines = json.Split("\n"); + if (lines.Length == 1) + return JsonSerializer.Deserialize>(lines[0], _jsonOptions); + if (lines.Length == 2) + return JsonSerializer.Deserialize>(lines[0], _jsonOptions); + if (lines.Length == 3) + return JsonSerializer.Deserialize>(lines[1], _jsonOptions); + } + catch{} + } + return null; + + } +} diff --git a/MyStem/FastMyStem.cs b/MyStemSharpness/Implementations/FastMyStem.cs similarity index 69% rename from MyStem/FastMyStem.cs rename to MyStemSharpness/Implementations/FastMyStem.cs index 5792ab8..0bce673 100644 --- a/MyStem/FastMyStem.cs +++ b/MyStemSharpness/Implementations/FastMyStem.cs @@ -1,41 +1,23 @@ -namespace MyStem; +namespace MyStemSharpness.Implementations; +using Microsoft.Extensions.Options; +using MyStemSharpness.Configuration; +using MyStemSharpness.Interfaces; using System; using System.Diagnostics; using System.IO; using System.Text; +using System.Text.Json; +using System.Threading; /// /// Cannot be used in a multithreaded environment! A class for interacting with the MyStem executable. /// -public sealed class FastMyStem : IDisposable +public sealed class FastMyStem : IMyStem { - /// - /// The default timeout for reading data from the MyStem process. - /// - public static readonly int TimeoutMs = 50; - - /// - /// Factor used to estimate the initial total buffer size for reading the output. - /// - public static float TotalBufferFactorSize = 3.5f; - - /// - /// Factor used to estimate the initial step buffer size for reading the output. - /// - public static float StepBufferFactorSize = 2.5f; - - /// - /// The string that marks the end of the input for MyStem. - /// - public static string EndString = "\nъъ"; - - /// - /// The string that is replaced with an empty string in the output. - /// - public static string EndReplaceString = "ъъ??\r\n"; private static readonly Encoding encoding = Encoding.UTF8; + private readonly SemaphoreSlim _processLock = new(1, 1); /// /// The process instance for the MyStem executable. @@ -50,16 +32,17 @@ public sealed class FastMyStem : IDisposable /// /// The options to configure the MyStem process. /// - public MyStemOptions Options { get; } + public IOptions Options { get; } /// /// Initializes a new instance of the class with the specified options. Requires the LineByLine option to be set to true. /// /// The MyStem options to use. - public FastMyStem(MyStemOptions? options = null) + public FastMyStem(IOptions? options = null) { - Options = options ?? new MyStemOptions(); - Options.LineByLine = true; // Required for stream reading + Options = options ?? Microsoft.Extensions.Options.Options.Create(new MyStemOptions()); + //Options.Value.LineByLine = true; + // Required for stream reading } /// @@ -70,13 +53,20 @@ private void InitializeProcess() { if (mystemProcess == null || mystemProcess.HasExited) { - mystemProcess?.Dispose(); + + try + { + mystemProcess?.Kill(); + mystemProcess?.Dispose(); + } + catch { } + mystemProcess = new Process { StartInfo = new ProcessStartInfo { - FileName = MyStemOptions.PathToMyStem, - Arguments = Options.GetArguments(), + FileName = Options.Value.PathToMyStem, + Arguments = Options.Value.GetArguments(), UseShellExecute = false, RedirectStandardInput = true, RedirectStandardOutput = true, @@ -101,17 +91,23 @@ private void InitializeProcess() /// The analysis result from MyStem. /// If the MyStem executable is not found at the specified path. /// If an error occurs during the MyStem analysis. - public string MultiAnalysis(string text) + public string Analysis(string text) { try { + _processLock.Wait(); InitializeProcess(); + return GetResults(text); } catch (Exception ex) { throw new FormatException($"Error during MyStem analysis. See logs for details. Text: '{text}'", ex); } + finally + { + _processLock.Release(); + } } /// @@ -122,19 +118,23 @@ public string MultiAnalysis(string text) private string GetResults(string inputText) { // Добавляем завершающую последовательность к входному тексту - inputText += EndString; + inputText += Options.Value.EndString; mystemProcess!.StandardInput.WriteLine(inputText); mystemProcess.StandardInput.Flush(); // Создаем MemoryStream для накопления всех байт - MemoryStream memoryStream = new((int)Math.Round(inputText.Length * TotalBufferFactorSize)); + MemoryStream memoryStream = new((int)Math.Round(inputText.Length * Options.Value.TotalBufferFactorSize)); // Размер буфера определяется как функция от размера входного текста - byte[] byteBuffer = new byte[(int)Math.Round(inputText.Length * StepBufferFactorSize)]; + var byteBuffer = new byte[(int)Math.Round(inputText.Length * Options.Value.StepBufferFactorSize)]; + + var totalBytesRead = 0; + var timeoutOccurred = false; - int totalBytesRead = 0; - bool timeoutOccurred = false; + byte[] stopBytes = encoding.GetBytes(Options.Value.EndString.Trim()); + int stopMatchIndex = 0; + var findStop = false; // Основной цикл чтения while (mystemProcess.StandardOutput.BaseStream.CanRead) { @@ -158,14 +158,19 @@ private string GetResults(string inputText) { // Асинхронное чтение с явным ожиданием asyncResult = mystemProcess.StandardOutput.BaseStream.BeginRead(byteBuffer, 0, byteBuffer.Length, null, null); - if (asyncResult.AsyncWaitHandle.WaitOne(TimeoutMs)) + if (asyncResult.AsyncWaitHandle.WaitOne(Options.Value.TimeoutMs)) { bytesRead = mystemProcess.StandardOutput.BaseStream.EndRead(asyncResult); } + } + // Гасим ошибку, пока не станет понятно как её решить + catch (ObjectDisposedException) + { + } finally { - asyncResult?.AsyncWaitHandle.Close(); + asyncResult?.AsyncWaitHandle?.Close(); } } @@ -179,9 +184,10 @@ private string GetResults(string inputText) // Записываем прочитанные байты в MemoryStream memoryStream.Write(byteBuffer, 0, bytesRead); - // Декодируем только что полученный кусок, чтобы проверить наличие завершающей последовательности "ъъ" - string chunk = encoding.GetString(byteBuffer, 0, bytesRead); - if (chunk.IndexOf("ъъ", StringComparison.Ordinal) >= 0) + if (!findStop) + findStop = ContainsStopSequence(byteBuffer, bytesRead, stopBytes, ref stopMatchIndex); + // Поиск стоп-последовательности с учетом пересечения буферов + if (findStop && byteBuffer.Contains((byte)'\n')) { break; } @@ -196,12 +202,39 @@ private string GetResults(string inputText) // Сбрасываем позицию в начале MemoryStream для декодирования memoryStream.Seek(0, SeekOrigin.Begin); // Декодируем все накопленные байты за один раз и заменяем специальную последовательность, если она есть - string result = encoding.GetString(memoryStream.ToArray()).Replace(EndReplaceString, string.Empty); + string result = encoding.GetString(memoryStream.ToArray()).Replace(Options.Value.EndReplaceString, string.Empty).Trim(); memoryStream.Dispose(); return result; } + + private bool ContainsStopSequence(byte[] buffer, int count, byte[] stopBytes, ref int matchIndex) + { + for (int i = 0; i < count; i++) + { + if (buffer[i] == stopBytes[matchIndex]) + { + matchIndex++; + if (matchIndex == stopBytes.Length) + { + return true; + } + } + else + { + matchIndex = 0; + // Если текущий байт совпадает с первым байтом стоп-последовательности, + // нужно проверить его снова + if (buffer[i] == stopBytes[0]) + { + matchIndex = 1; + } + } + } + return false; + } + /// /// Disposes of the resources used by the object. /// @@ -223,7 +256,6 @@ public void Dispose() } disposed = true; } - GC.SuppressFinalize(this); } /// diff --git a/MyStem/MyStem.cs b/MyStemSharpness/Implementations/MyStem.cs similarity index 85% rename from MyStem/MyStem.cs rename to MyStemSharpness/Implementations/MyStem.cs index 9b4102d..c20a588 100644 --- a/MyStem/MyStem.cs +++ b/MyStemSharpness/Implementations/MyStem.cs @@ -1,5 +1,8 @@ -namespace MyStem; +namespace MyStemSharpness.Implementations; +using Microsoft.Extensions.Options; +using MyStemSharpness.Configuration; +using MyStemSharpness.Interfaces; using System; using System.Diagnostics; using System.IO; @@ -11,7 +14,7 @@ namespace MyStem; /// /// This class is not recommended for use in multithreaded scenarios. /// -public sealed class MyStem : IDisposable +public sealed class MyStem : IMyStem { /// /// The process instance for the MyStem executable. @@ -26,15 +29,16 @@ public sealed class MyStem : IDisposable /// /// The options to configure the MyStem process. /// - public MyStemOptions Options { get; } + public IOptions Options { get; } /// /// Initializes a new instance of the class with the specified options. /// /// The MyStem options to use. - public MyStem(MyStemOptions? options = null) + public MyStem(IOptions? options = null) { - Options = options ?? new MyStemOptions(); + Options = options ?? Microsoft.Extensions.Options.Options.Create(new MyStemOptions()); + } /// @@ -49,8 +53,8 @@ public void Initialize() { StartInfo = new ProcessStartInfo { - FileName = MyStemOptions.PathToMyStem, - Arguments = Options.GetArguments(), + FileName = Options.Value.PathToMyStem, + Arguments = Options.Value.GetArguments(), UseShellExecute = false, RedirectStandardInput = true, RedirectStandardOutput = true, @@ -73,7 +77,7 @@ public void Initialize() /// If an error occurs during the MyStem analysis. public string Analysis(string text) { - if (!File.Exists(MyStemOptions.PathToMyStem)) + if (!File.Exists(Options.Value.PathToMyStem)) { throw new FileNotFoundException("Path to MyStem.exe is not valid!"); } diff --git a/MyStemSharpness/Interfaces/IMyStem.cs b/MyStemSharpness/Interfaces/IMyStem.cs new file mode 100644 index 0000000..35fbf34 --- /dev/null +++ b/MyStemSharpness/Interfaces/IMyStem.cs @@ -0,0 +1,18 @@ +using Microsoft.Extensions.Options; +using MyStemSharpness.Configuration; + +namespace MyStemSharpness.Interfaces; + +public interface IMyStem: IDisposable +{ + /// + /// Analyzes the given text using the MyStem executable in a single-threaded manner. + /// + /// The text to analyze. + /// The analysis result from MyStem. + /// If the MyStem executable is not found at the specified path. + /// If an error occurs during the MyStem analysis. + string Analysis(string text); + + internal IOptions Options { get; } +} diff --git a/MyStemSharpness/Models/EnrichedAnalysis.cs b/MyStemSharpness/Models/EnrichedAnalysis.cs new file mode 100644 index 0000000..cc14f54 --- /dev/null +++ b/MyStemSharpness/Models/EnrichedAnalysis.cs @@ -0,0 +1,8 @@ +namespace MyStemSharpness.Models; + +public record EnrichedAnalysis +{ + public string OriginalText { get; init; } = string.Empty; + public List Variants { get; init; } = new(); + public bool HasAnalysis => Variants.Count > 0; +} \ No newline at end of file diff --git a/MyStemSharpness/Models/Enums/Animacy.cs b/MyStemSharpness/Models/Enums/Animacy.cs new file mode 100644 index 0000000..7c5cd2c --- /dev/null +++ b/MyStemSharpness/Models/Enums/Animacy.cs @@ -0,0 +1,12 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models.Enums; + +public enum Animacy +{ + [JsonPropertyName("anim")] + Animate, + + [JsonPropertyName("inan")] + Inanimate +} \ No newline at end of file diff --git a/MyStemSharpness/Models/Enums/Aspect.cs b/MyStemSharpness/Models/Enums/Aspect.cs new file mode 100644 index 0000000..cd91766 --- /dev/null +++ b/MyStemSharpness/Models/Enums/Aspect.cs @@ -0,0 +1,12 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models.Enums; + +public enum Aspect +{ + [JsonPropertyName("perf")] + Perfective, + + [JsonPropertyName("impf")] + Imperfective +} \ No newline at end of file diff --git a/MyStemSharpness/Models/Enums/Case.cs b/MyStemSharpness/Models/Enums/Case.cs new file mode 100644 index 0000000..e23c9c2 --- /dev/null +++ b/MyStemSharpness/Models/Enums/Case.cs @@ -0,0 +1,42 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models.Enums; + +public enum Case +{ + [JsonPropertyName("nomn")] + Nominative, + + [JsonPropertyName("gent")] + Genitive, + + [JsonPropertyName("datv")] + Dative, + + [JsonPropertyName("accs")] + Accusative, + + [JsonPropertyName("ablt")] + Instrumental, + + [JsonPropertyName("loct")] + Prepositional, + + [JsonPropertyName("voct")] + Vocative, + + [JsonPropertyName("gen1")] + FirstGenitive, + + [JsonPropertyName("gen2")] + SecondGenitive, + + [JsonPropertyName("acc2")] + SecondAccusative, + + [JsonPropertyName("loc1")] + FirstPrepositional, + + [JsonPropertyName("loc2")] + SecondPrepositional +} \ No newline at end of file diff --git a/MyStemSharpness/Models/Enums/Degree.cs b/MyStemSharpness/Models/Enums/Degree.cs new file mode 100644 index 0000000..dfb9369 --- /dev/null +++ b/MyStemSharpness/Models/Enums/Degree.cs @@ -0,0 +1,12 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models.Enums; + +public enum Degree +{ + [JsonPropertyName("comp")] + Comparative, + + [JsonPropertyName("supr")] + Superlative +} \ No newline at end of file diff --git a/MyStemSharpness/Models/Enums/Gender.cs b/MyStemSharpness/Models/Enums/Gender.cs new file mode 100644 index 0000000..78c0b51 --- /dev/null +++ b/MyStemSharpness/Models/Enums/Gender.cs @@ -0,0 +1,18 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models.Enums; + +public enum Gender +{ + [JsonPropertyName("masc")] + Masculine, + + [JsonPropertyName("femn")] + Feminine, + + [JsonPropertyName("neut")] + Neuter, + + [JsonPropertyName("ms-f")] + CommonGender +} diff --git a/MyStemSharpness/Models/Enums/Mood.cs b/MyStemSharpness/Models/Enums/Mood.cs new file mode 100644 index 0000000..219e5d9 --- /dev/null +++ b/MyStemSharpness/Models/Enums/Mood.cs @@ -0,0 +1,12 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models.Enums; + +public enum Mood +{ + [JsonPropertyName("indc")] + Indicative, + + [JsonPropertyName("impr")] + Imperative +} \ No newline at end of file diff --git a/MyStemSharpness/Models/Enums/Number.cs b/MyStemSharpness/Models/Enums/Number.cs new file mode 100644 index 0000000..dea93e3 --- /dev/null +++ b/MyStemSharpness/Models/Enums/Number.cs @@ -0,0 +1,12 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models.Enums; + +public enum Number +{ + [JsonPropertyName("sing")] + Singular, + + [JsonPropertyName("plur")] + Plural +} diff --git a/MyStemSharpness/Models/Enums/PartOfSpeech.cs b/MyStemSharpness/Models/Enums/PartOfSpeech.cs new file mode 100644 index 0000000..a579b31 --- /dev/null +++ b/MyStemSharpness/Models/Enums/PartOfSpeech.cs @@ -0,0 +1,48 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models.Enums; + +public enum PartOfSpeech +{ + [JsonPropertyName("A")] + Adjective, + + [JsonPropertyName("ADV")] + Adverb, + + [JsonPropertyName("ADVPRO")] + PronominalAdverb, + + [JsonPropertyName("ANUM")] + NumeralAdjective, + + [JsonPropertyName("APRO")] + PronominalAdjective, + + [JsonPropertyName("COM")] + PartOfCompound, + + [JsonPropertyName("CONJ")] + Conjunction, + + [JsonPropertyName("INTJ")] + Interjection, + + [JsonPropertyName("NUM")] + Numeral, + + [JsonPropertyName("PART")] + Particle, + + [JsonPropertyName("PR")] + Preposition, + + [JsonPropertyName("S")] + Noun, + + [JsonPropertyName("SPRO")] + PronounNoun, + + [JsonPropertyName("V")] + Verb +} diff --git a/MyStemSharpness/Models/Enums/Person.cs b/MyStemSharpness/Models/Enums/Person.cs new file mode 100644 index 0000000..de72d02 --- /dev/null +++ b/MyStemSharpness/Models/Enums/Person.cs @@ -0,0 +1,15 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models.Enums; + +public enum Person +{ + [JsonPropertyName("1per")] + First, + + [JsonPropertyName("2per")] + Second, + + [JsonPropertyName("3per")] + Third +} \ No newline at end of file diff --git a/MyStemSharpness/Models/Enums/Tense.cs b/MyStemSharpness/Models/Enums/Tense.cs new file mode 100644 index 0000000..ac1cb6a --- /dev/null +++ b/MyStemSharpness/Models/Enums/Tense.cs @@ -0,0 +1,15 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models.Enums; + +public enum Tense +{ + [JsonPropertyName("pres")] + Present, + + [JsonPropertyName("past")] + Past, + + [JsonPropertyName("futr")] + Future +} diff --git a/MyStemSharpness/Models/Enums/Transitivity.cs b/MyStemSharpness/Models/Enums/Transitivity.cs new file mode 100644 index 0000000..1c23052 --- /dev/null +++ b/MyStemSharpness/Models/Enums/Transitivity.cs @@ -0,0 +1,12 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models.Enums; + +public enum Transitivity +{ + [JsonPropertyName("tran")] + Transitive, + + [JsonPropertyName("intr")] + Intransitive +} diff --git a/MyStemSharpness/Models/Enums/Voice.cs b/MyStemSharpness/Models/Enums/Voice.cs new file mode 100644 index 0000000..fb85fe3 --- /dev/null +++ b/MyStemSharpness/Models/Enums/Voice.cs @@ -0,0 +1,12 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models.Enums; + +public enum Voice +{ + [JsonPropertyName("actv")] + Active, + + [JsonPropertyName("pssv")] + Passive +} diff --git a/MyStemSharpness/Models/GrammarAnalysis.cs b/MyStemSharpness/Models/GrammarAnalysis.cs new file mode 100644 index 0000000..6fe91e6 --- /dev/null +++ b/MyStemSharpness/Models/GrammarAnalysis.cs @@ -0,0 +1,51 @@ +using MyStemSharpness.Models.Enums; +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models; + +public record GrammarAnalysis +{ + [JsonPropertyName("pos")] + public PartOfSpeech? PartOfSpeech { get; init; } + + [JsonPropertyName("gender")] + public Gender? Gender { get; init; } + + [JsonPropertyName("number")] + public Number? Number { get; init; } + + [JsonPropertyName("case")] + public Case? Case { get; init; } + + [JsonPropertyName("tense")] + public Tense? Tense { get; init; } + + [JsonPropertyName("voice")] + public Voice? Voice { get; init; } + + [JsonPropertyName("mood")] + public Mood? Mood { get; init; } + + [JsonPropertyName("aspect")] + public Aspect? Aspect { get; init; } + + [JsonPropertyName("animacy")] + public Animacy? Animacy { get; init; } + + [JsonPropertyName("person")] + public Person? Person { get; init; } + + [JsonPropertyName("degree")] + public Degree? Degree { get; init; } + + [JsonPropertyName("transitivity")] + public Transitivity? Transitivity { get; init; } + + // Дополнительные поля, которые могут присутствовать + [JsonPropertyName("invl")] + public bool? Involved { get; init; } + + [JsonPropertyName("RO")] + public string? Root { get; init; } +} + diff --git a/MyStemSharpness/Models/MyStemAnalysis.cs b/MyStemSharpness/Models/MyStemAnalysis.cs new file mode 100644 index 0000000..8eaef27 --- /dev/null +++ b/MyStemSharpness/Models/MyStemAnalysis.cs @@ -0,0 +1,15 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models; + +public record MyStemAnalysis +{ + [JsonPropertyName("text")] + public List TextAnalysis { get; init; } = new(); + + [JsonPropertyName("error")] + public string? Error { get; init; } + + [JsonPropertyName("version")] + public string? Version { get; init; } +} \ No newline at end of file diff --git a/MyStemSharpness/Models/MyStemBatchResult.cs b/MyStemSharpness/Models/MyStemBatchResult.cs new file mode 100644 index 0000000..7affdc8 --- /dev/null +++ b/MyStemSharpness/Models/MyStemBatchResult.cs @@ -0,0 +1,12 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models; + +public record MyStemBatchResult +{ + [JsonPropertyName("result")] + public List Result { get; init; } = new(); + + [JsonPropertyName("error")] + public string? Error { get; init; } +} \ No newline at end of file diff --git a/MyStemSharpness/Models/MyStemWordResult.cs b/MyStemSharpness/Models/MyStemWordResult.cs new file mode 100644 index 0000000..7a7522f --- /dev/null +++ b/MyStemSharpness/Models/MyStemWordResult.cs @@ -0,0 +1,12 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models; + +public record MyStemWordResult +{ + [JsonPropertyName("analysis")] + public List Analysis { get; init; } = new(); + + [JsonPropertyName("text")] + public string Text { get; init; } = string.Empty; +} diff --git a/MyStemSharpness/Models/ParsedVariant.cs b/MyStemSharpness/Models/ParsedVariant.cs new file mode 100644 index 0000000..66ca21b --- /dev/null +++ b/MyStemSharpness/Models/ParsedVariant.cs @@ -0,0 +1,11 @@ +using MyStemSharpness.Models.Enums; + +namespace MyStemSharpness.Models; + +public record ParsedVariant +{ + public string Lemma { get; init; } = string.Empty; + public PartOfSpeech? PartOfSpeech { get; init; } + public List GrammarFeatures { get; init; } = new(); + public double Confidence { get; init; } +} \ No newline at end of file diff --git a/MyStemSharpness/Models/TextAnalysis.cs b/MyStemSharpness/Models/TextAnalysis.cs new file mode 100644 index 0000000..c9d617c --- /dev/null +++ b/MyStemSharpness/Models/TextAnalysis.cs @@ -0,0 +1,12 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models; + +public record TextAnalysis +{ + [JsonPropertyName("text")] + public string Text { get; init; } = string.Empty; + + [JsonPropertyName("analysis")] + public List Analysis { get; init; } = new(); +} diff --git a/MyStemSharpness/Models/WordAnalysis.cs b/MyStemSharpness/Models/WordAnalysis.cs new file mode 100644 index 0000000..136b040 --- /dev/null +++ b/MyStemSharpness/Models/WordAnalysis.cs @@ -0,0 +1,32 @@ +using System.Text.Json.Serialization; + +namespace MyStemSharpness.Models; + +public record WordAnalysis +{ + [JsonPropertyName("lex")] + public string? Lemma { get; init; } + + [JsonPropertyName("gr")] + public string? RawGrammar { get; init; } + + [JsonPropertyName("wt")] + public double? Weight { get; init; } + + [JsonPropertyName("qual")] + public string? Quality { get; init; } + + // Разобранные грамматические характеристики + [JsonIgnore] + public GrammarAnalysis? Grammar => ParseGrammar(RawGrammar); + + private static GrammarAnalysis? ParseGrammar(string? rawGrammar) + { + if (string.IsNullOrEmpty(rawGrammar)) + return null; + + // Здесь можно реализовать парсинг строки грамматики + // MyStem возвращает грамматику в формате "S,femn,sing,nomn" + return new GrammarAnalysis(); + } +} diff --git a/MyStemSharpness/MyStemSharpness.csproj b/MyStemSharpness/MyStemSharpness.csproj new file mode 100644 index 0000000..be0edaf --- /dev/null +++ b/MyStemSharpness/MyStemSharpness.csproj @@ -0,0 +1,31 @@ + + + + Library + net8.0 + enable + enable + + + MyStemSharpness + 1.2.0 + paralax034 + MyStem from Yandex for C# + © paralax034 2025 + MIT + https://github.com/Scream034/MyStemSharpness + https://github.com/Scream034/MyStemSharpness.git + git + mystem;nlp;russian;linguistics;ml + true + + + + + + + + + + + \ No newline at end of file diff --git a/Test.cs b/Test.cs deleted file mode 100644 index 3d7d352..0000000 --- a/Test.cs +++ /dev/null @@ -1,26 +0,0 @@ -using System.Diagnostics; -using MyStem; - -public static class Test -{ - public static void Main() - { - FastMyStem stem = new(new() { PrintOnlyLemmasAndGrammemes = true }); - - List inputs = new List() {"Двигатель башни колонки", "!!!!", "Тестовъ три", "Тестовых восемь тысяч", "Где деньги Либовский?"}; - for (int i = 0; i < 100000; i++) - { - inputs.Add(i.ToString()); - } - Stopwatch stopwatch = Stopwatch.StartNew(); - foreach (var input in inputs) - { - var result = stem.MultiAnalysis(input); - Console.WriteLine($"{input} -> {result}"); - } - - stopwatch.Stop(); - Console.WriteLine($"Time: {stopwatch.ElapsedMilliseconds} ms"); - Console.WriteLine($"Total memory: {Process.GetCurrentProcess().WorkingSet64 / 1024 / 1024} MB"); - } -} \ No newline at end of file