Skip to content

Commit

Permalink
rearrange
Browse files Browse the repository at this point in the history
  • Loading branch information
russcam committed Apr 30, 2024
1 parent c7fde32 commit c11a0f3
Show file tree
Hide file tree
Showing 15 changed files with 2,179 additions and 2,075 deletions.
27 changes: 27 additions & 0 deletions Directory.Build.props
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Shared MSBuild defaults: package metadata, language settings and MinVer-based versioning. -->

<PropertyGroup>
<!-- NuGet package metadata. -->
<Authors>Russ Cam and Contributors</Authors>
<Copyright>Russ Cam</Copyright>
<PackageLicenseExpression>Apache-2.0</PackageLicenseExpression>
<RepositoryUrl>https://github.com/russcam/lingua-dotnet</RepositoryUrl>
<PackageProjectUrl>https://github.com/russcam/lingua-dotnet</PackageProjectUrl>
<PackageReleaseNotes>https://github.com/russcam/lingua-dotnet/releases</PackageReleaseNotes>
<PackageTags>lingua, language detection</PackageTags>
<!-- Directory containing Lingua.sln, located by searching from this file's directory upwards. -->
<SolutionRoot>$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), Lingua.sln))</SolutionRoot>
<!-- Strong-name signing settings; currently commented out. -->
<!-- <AssemblyOriginatorKeyFile>$(SolutionRoot)build\keys\keypair.snk</AssemblyOriginatorKeyFile>-->
<!-- <PublicKey>97ab15215fab2a5d</PublicKey>-->
<LangVersion>latest</LangVersion>
<!-- <TreatWarningsAsErrors>true</TreatWarningsAsErrors>-->
<!-- Packing is disabled by default; a project or nested props file must set IsPackable to true to opt in. -->
<IsPackable>False</IsPackable>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<!-- Consumed by the MinVer package referenced below; see the MinVer documentation for MinVerMinimumMajorMinor. -->
<MinVerMinimumMajorMinor>1.0</MinVerMinimumMajorMinor>
</PropertyGroup>

<ItemGroup>
<!-- PrivateAssets="all" keeps MinVer a build-time-only reference that is not exposed as a package dependency. -->
<PackageReference Include="MinVer" Version="4.3.0" PrivateAssets="all"/>
</ItemGroup>

</Project>
12 changes: 11 additions & 1 deletion Lingua.sln
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,20 @@
Microsoft Visual Studio Solution File, Format Version 12.00
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lingua", "src\Lingua\Lingua.csproj", "{835F7E5D-80DB-40FC-9240-DE251FAE63DC}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "UnicodeScriptGenerator", "src\UnicodeScriptGenerator\UnicodeScriptGenerator.csproj", "{77954C87-8D00-40FA-9617-DFAF3499DE9C}"
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "UnicodeScriptGenerator", "build\UnicodeScriptGenerator\UnicodeScriptGenerator.csproj", "{77954C87-8D00-40FA-9617-DFAF3499DE9C}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lingua.Tests", "tests\Lingua.Tests\Lingua.Tests.csproj", "{120D07F4-AAF1-4D00-BF86-48836F819C1F}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Root", "Solution Root", "{C4287977-4407-47E3-A452-57DB92BC5432}"
ProjectSection(SolutionItems) = preProject
.editorconfig = .editorconfig
.gitattributes = .gitattributes
.gitignore = .gitignore
global.json = global.json
README.md = README.md
Directory.Build.props = Directory.Build.props
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ public static async Task Main(string[] args)
var firstLine = File.ReadLines(fileName).First();
var version = VersionRegex().Match(firstLine).Groups[1].Value;
var builder = new StringBuilder($@"using System.Globalization;
using static Lingua.UnicodeScript;
namespace Lingua;
Expand Down Expand Up @@ -56,7 +57,7 @@ public static class CharExtensions
for (var i = 0; i < orderedScripts.Count; i++)
{
builder.AppendLine(
$" 0x{orderedScripts[i].CodePointRange.Start.ToString("X4")}, // {orderedScripts[i].CodePointRange.Start.ToString("X4")}..{orderedScripts[i].CodePointRange.End.ToString("X4")}; {orderedScripts[i].Name}");
$" 0x{orderedScripts[i].CodePointRange.Start.ToString("X4")}, // {orderedScripts[i].CodePointRange.ToString()}; {orderedScripts[i].Name}");
}

builder.AppendLine(@" };
Expand All @@ -67,7 +68,7 @@ public static class CharExtensions
for (var i = 0; i < orderedScripts.Count; i++)
{
builder.AppendLine(
$" UnicodeScript.{orderedScripts[i].Name.Replace("_", "")},\t// {orderedScripts[i].CodePointRange.Start.ToString("X4")}..{orderedScripts[i].CodePointRange.End.ToString("X4")}");
$" {orderedScripts[i].Name.Replace("_", "")},\t// {orderedScripts[i].CodePointRange.ToString()}");
}

builder.AppendLine(@" };
Expand Down Expand Up @@ -118,18 +119,18 @@ private static string FindSolutionRoot()
$"Could not find solution root directory from the current directory {startDir}");
}

private static List<((int Start, int End)[] CodePointRanges, string Name)> ReadUnicodeScriptsFromFile(
private static List<(CodePointRange[] CodePointRanges, string Name)> ReadUnicodeScriptsFromFile(
string fileName)
{
using var stream = File.OpenRead(fileName);
using var reader = new UnicodeDataFileReader(stream, ';');
var unicodeScripts = new List<((int, int)[], string)>();
var unicodePointRanges = new List<(int, int)>(100);
var unicodeScripts = new List<(CodePointRange[], string)>();
var unicodePointRanges = new List<CodePointRange>(100);
string? name = null;

while (reader.MoveToNextLine())
{
var unicodeCodePointRange = ParseCodepointRange(reader.ReadField());
var unicodeCodePointRange = ParseCodepointRange(reader.ReadField()!);
var currentName = reader.ReadTrimmedField();
if (name == null || name.Equals(currentName, StringComparison.OrdinalIgnoreCase))
{
Expand All @@ -149,7 +150,7 @@ private static string FindSolutionRoot()
return unicodeScripts;
}

private static (int, int) ParseCodepointRange(string range)
private static CodePointRange ParseCodepointRange(string range)
{
int start, end;
var rangeSeparatorOffset = range.IndexOf("..", StringComparison.InvariantCulture);
Expand All @@ -166,18 +167,18 @@ private static (int, int) ParseCodepointRange(string range)
break;
}

return (start, end);
return new(start, end);
}

private static List<((int Start, int End) CodePointRange, string Name)> CreateCollapsedOrderedRange(
List<((int Start, int End)[] CodePointRanges, string Name)> unicodeScripts)
private static List<(CodePointRange CodePointRange, string Name)> CreateCollapsedOrderedRange(
List<(CodePointRange[] CodePointRanges, string Name)> unicodeScripts)
{
List<((int Start, int End) CodePointRange, string Name)> orderedScripts = unicodeScripts
List<(CodePointRange CodePointRange, string Name)> orderedScripts = unicodeScripts
.SelectMany(s => s.CodePointRanges.Select(range => (range, s.Name)))
.OrderBy(v => v.range.Start)
.ToList();

List<((int Start, int End) CodePointRange, string Name)> orderedScriptsWithUnknownInsertions =
List<(CodePointRange CodePointRange, string Name)> orderedScriptsWithUnknownInsertions =
new(orderedScripts.Count);
var currentRangeStart = orderedScripts[0].CodePointRange.Start;
var currentRangeEnd = orderedScripts[0].CodePointRange.End;
Expand All @@ -199,11 +200,11 @@ private static (int, int) ParseCodepointRange(string range)
else
{
// Add the current collapsed range to the result
orderedScriptsWithUnknownInsertions.Add(((currentRangeStart, currentRangeEnd), currentScriptName));
orderedScriptsWithUnknownInsertions.Add((new(currentRangeStart, currentRangeEnd), currentScriptName));

// If there's a gap between the current range's end and the next range's start, add an Unknown range
if (nextRangeStart > currentRangeEnd + 1)
orderedScriptsWithUnknownInsertions.Add(((currentRangeEnd + 1, nextRangeStart - 1), "Unknown"));
orderedScriptsWithUnknownInsertions.Add((new(currentRangeEnd + 1, nextRangeStart - 1), "Unknown"));

// Update currentRangeStart and currentRangeEnd
currentRangeStart = nextRangeStart;
Expand All @@ -214,12 +215,19 @@ private static (int, int) ParseCodepointRange(string range)

// Add the last collapsed range to the result
orderedScriptsWithUnknownInsertions
.Add(((currentRangeStart, currentRangeEnd), currentScriptName));
.Add((new(currentRangeStart, currentRangeEnd), currentScriptName));

// Add Unknown to cover the remaining range
orderedScriptsWithUnknownInsertions
.Add(((orderedScripts[^1].CodePointRange.End + 1, 0x10FFFF), "Unknown"));
.Add((new(orderedScripts[^1].CodePointRange.End + 1, 0x10FFFF), "Unknown"));

return orderedScriptsWithUnknownInsertions;
}
}

/// <summary>
/// A pair of code point bounds, <see cref="Start"/> and <see cref="End"/>.
/// </summary>
/// <param name="Start">The code point at which the range starts.</param>
/// <param name="End">The code point at which the range ends.</param>
public readonly record struct CodePointRange(int Start, int End)
{
    /// <summary>
    /// Formats the range as upper-case hexadecimal, padded to at least four digits.
    /// </summary>
    /// <returns>
    /// The single formatted value when <see cref="Start"/> equals <see cref="End"/>;
    /// otherwise both values joined by "..".
    /// </returns>
    public override string ToString()
    {
        var first = Start.ToString("X4");
        if (Start == End)
        {
            return first;
        }

        return first + ".." + End.ToString("X4");
    }
}
File renamed without changes.
Binary file added build/keys/keypair.snk
Binary file not shown.
Binary file added images/nuget-icon.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
28 changes: 28 additions & 0 deletions src/Directory.Build.props
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Packaging settings, layered on top of the nearest Directory.Build.props found above this directory. -->
<Import Project="$([MSBuild]::GetPathOfFileAbove('Directory.Build.props', '$(MSBuildThisFileDirectory)../'))"/>
<PropertyGroup>
<IsPackable>true</IsPackable>
<!-- <SignAssembly>true</SignAssembly> -->
<PackageIcon>nuget-icon.png</PackageIcon>
<PackageReadmeFile>README.md</PackageReadmeFile>
<!-- Appends .pdb to the extensions allowed in the package's build output folder. -->
<AllowedOutputExtensionsInPackageBuildOutputFolder>$(AllowedOutputExtensionsInPackageBuildOutputFolder);.pdb</AllowedOutputExtensionsInPackageBuildOutputFolder>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<DebugSymbols>true</DebugSymbols>
<!-- Source Link related settings (see the Microsoft.SourceLink.GitHub reference below). -->
<PublishRepositoryUrl>true</PublishRepositoryUrl>
<EmbedUntrackedSources>true</EmbedUntrackedSources>
<!-- NOTE(review): set unconditionally; .NET guidance is to enable this only on CI builds. Confirm it is intended for local builds too. -->
<ContinuousIntegrationBuild>true</ContinuousIntegrationBuild>
</PropertyGroup>

<!-- Files packed at the package root; PackageIcon and PackageReadmeFile above refer to two of them by file name. -->
<ItemGroup>
<None Include="$(SolutionRoot)README.md" Pack="true" PackagePath="." />
<None Include="$(SolutionRoot)LICENSE" Pack="true" PackagePath="." />
<!-- NOTE(review): unlike the two entries above, this path adds a separator after $(SolutionRoot). Confirm whether SolutionRoot already ends with one. -->
<None Include="$(SolutionRoot)\images\nuget-icon.png" Pack="true" PackagePath="." />
</ItemGroup>

<!-- Build-time-only tooling: PrivateAssets="All" keeps these from becoming dependencies of the produced package. -->
<ItemGroup>
<PackageReference Include="Microsoft.SourceLink.GitHub" Version="1.1.1" PrivateAssets="All"/>
<PackageReference Include="ConfigureAwaitChecker.Analyzer" Version="5.0.0.1" PrivateAssets="All" />
<PackageReference Include="SauceControl.InheritDoc" Version="1.3.0" PrivateAssets="All" />
</ItemGroup>
</Project>
42 changes: 39 additions & 3 deletions src/Lingua/Internal/Ngram.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,18 @@

namespace Lingua.Internal;

/// <summary>
/// A connected string of N items from a sample of text.
/// </summary>
public readonly struct Ngram : IEquatable<Ngram>
{
private readonly string _value;

/// <summary>
/// Initializes a new instance of <see cref="Ngram"/>
/// </summary>
/// <param name="value">The sample of text.</param>
/// <exception cref="ArgumentException">If the length of <paramref name="value"/> is greater than 5.</exception>
public Ngram(string value)
{
if (value.Length > 5)
Expand All @@ -14,25 +22,44 @@ public Ngram(string value)
_value = value;
}

/// <summary>
/// An Ngram of zero length
/// </summary>
public static readonly Ngram Zerogram = new("");

/// <summary>
/// Whether value is empty.
/// </summary>
public bool IsEmpty => _value.Length == 0;

/// <summary>
/// Gets the length of the value.
/// </summary>
public int Length => _value.Length;

/// <summary>
/// Decrements and returns a new instance of an <see cref="Ngram"/> that is a substring of this instance.
/// </summary>
/// <returns>A new instance of <see cref="Ngram"/></returns>
/// <exception cref="InvalidOperationException">If the Ngram has a length of 0.</exception>
public Ngram Dec() =>
_value.Length switch
{
0 => throw new Exception("Zerogram is ngram type of lowest order and can not be decremented"),
0 => throw new InvalidOperationException("Zerogram is ngram type of lowest order and can not be decremented"),
1 => Zerogram,
_ => new Ngram(_value.Substring(0, _value.Length - 1))
};

/// <inheritdoc />
public override string ToString() => _value;

/// <inheritdoc />
public bool Equals(Ngram other) => _value == other._value;

/// <inheritdoc />
public override bool Equals(object? obj) => obj is Ngram other && Equals(other);

/// <inheritdoc />
public override int GetHashCode() => _value.GetHashCode();

public IEnumerable<Ngram> RangeOfLowerOrderNGrams() =>
Expand All @@ -50,7 +77,7 @@ internal static string GetNgramNameByLength(int ngramLength) =>
};
}

public class NgramEnumerable : IEnumerable<Ngram>
public struct NgramEnumerable : IEnumerable<Ngram>
{
private readonly Ngram _start;

Expand All @@ -62,18 +89,24 @@ public NgramEnumerable(Ngram start, Ngram endInclusive)
_start = start;
}

/// <inheritdoc />
public IEnumerator<Ngram> GetEnumerator() => new NgramEnumerator(_start);

IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
}

public class NgramEnumerator : IEnumerator<Ngram>
public struct NgramEnumerator : IEnumerator<Ngram>
{
private readonly Ngram _start;
private Ngram? _current;

/// <summary>
/// Initializes a new instance of <see cref="NgramEnumerator"/>
/// </summary>
/// <param name="start">The <see cref="Ngram"/> the enumerator starts from.</param>
public NgramEnumerator(Ngram start) => _start = start;

/// <inheritdoc />
public bool MoveNext()
{
if (_current is null)
Expand All @@ -89,12 +122,15 @@ public bool MoveNext()
return !_current.Value.IsEmpty;
}

/// <inheritdoc />
public void Reset() => _current = null;

/// <inheritdoc />
public Ngram Current => _current!.Value;

object IEnumerator.Current => Current;

/// <inheritdoc />
public void Dispose()
{
}
Expand Down
19 changes: 8 additions & 11 deletions src/Lingua/Internal/TestDataLanguageModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,25 @@

namespace Lingua.Internal;

public record TestDataLanguageModel(HashSet<Ngram> Ngrams)
public partial record struct TestDataLanguageModel(HashSet<Ngram> Ngrams)
{
private static readonly Regex LetterRegex = new("^\\p{L}+$", RegexOptions.Compiled);
[GeneratedRegex("^\\p{L}+$")]
private static partial Regex LetterRegex();

public static TestDataLanguageModel FromText(string text, int ngramLength)
{
if (ngramLength is < 1 or > 5)
{
throw new ArgumentException($"ngram length {ngramLength} is not in range 1..5");
}
throw new ArgumentOutOfRangeException(nameof(ngramLength),$"ngram length {ngramLength} is not in range 1..5");

var ngrams = new HashSet<Ngram>();
var textSpan = text.AsSpan();
for (var i = 0; i <= text.Length - ngramLength; i++)
{
var textSlice = textSpan.Slice(i, ngramLength);
if (LetterRegex.IsMatch(textSlice))
{
ngrams.Add(new Ngram(textSlice.ToString()));
}
if (LetterRegex().IsMatch(textSlice))
ngrams.Add(new Ngram(textSlice.ToString()));
}

return new TestDataLanguageModel(ngrams);
return new(ngrams);
}
}
}
10 changes: 4 additions & 6 deletions src/Lingua/Lingua.csproj
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<Title>Lingua</Title>
<Description>Natural language detection library for .NET, suitable for long and short text alike</Description>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>Lingua</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>

<ItemGroup>
Expand All @@ -19,9 +18,8 @@
</ItemGroup>

<ItemGroup>
<AssemblyAttribute Include="System.Runtime.CompilerServices.InternalsVisibleToAttribute">
<_Parameter1>$(AssemblyName).Tests</_Parameter1>
</AssemblyAttribute>
<!-- <InternalsVisibleTo Include="$(AssemblyName).Tests" Key="$(PublicKey)" />-->
<InternalsVisibleTo Include="$(AssemblyName).Tests" />
</ItemGroup>

</Project>
Loading

0 comments on commit c11a0f3

Please sign in to comment.