lemmagen sentence lemmatizer
This commit is contained in:
parent
e558cb05d8
commit
015a916d20
Binary file not shown.
BIN
mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGen/LemmaSharp.dll
Normal file
BIN
mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGen/LemmaSharp.dll
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGen/Lzma#.dll
Normal file
BIN
mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGen/Lzma#.dll
Normal file
Binary file not shown.
@ -0,0 +1,22 @@
|
|||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||||
|
# Visual Studio 14
|
||||||
|
VisualStudioVersion = 14.0.25420.1
|
||||||
|
MinimumVisualStudioVersion = 10.0.40219.1
|
||||||
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LemmaGenSentenceLemmatizer", "LemmaGenSentenceLemmatizer\LemmaGenSentenceLemmatizer.csproj", "{66B48B61-7304-4352-8720-DD664AD06138}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|Any CPU = Debug|Any CPU
|
||||||
|
Release|Any CPU = Release|Any CPU
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{66B48B61-7304-4352-8720-DD664AD06138}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||||
|
{66B48B61-7304-4352-8720-DD664AD06138}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
|
{66B48B61-7304-4352-8720-DD664AD06138}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
|
{66B48B61-7304-4352-8720-DD664AD06138}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8" ?>
|
||||||
|
<configuration>
|
||||||
|
<startup>
|
||||||
|
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2" />
|
||||||
|
</startup>
|
||||||
|
</configuration>
|
@ -0,0 +1,73 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
|
||||||
|
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
|
||||||
|
<ProjectGuid>{66B48B61-7304-4352-8720-DD664AD06138}</ProjectGuid>
|
||||||
|
<OutputType>Exe</OutputType>
|
||||||
|
<AppDesignerFolder>Properties</AppDesignerFolder>
|
||||||
|
<RootNamespace>LemmaGenSentenceLemmatizer</RootNamespace>
|
||||||
|
<AssemblyName>LemmaGenSentenceLemmatizer</AssemblyName>
|
||||||
|
<TargetFrameworkVersion>v4.5.2</TargetFrameworkVersion>
|
||||||
|
<FileAlignment>512</FileAlignment>
|
||||||
|
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
|
||||||
|
<PlatformTarget>AnyCPU</PlatformTarget>
|
||||||
|
<DebugSymbols>true</DebugSymbols>
|
||||||
|
<DebugType>full</DebugType>
|
||||||
|
<Optimize>false</Optimize>
|
||||||
|
<OutputPath>bin\Debug\</OutputPath>
|
||||||
|
<DefineConstants>DEBUG;TRACE</DefineConstants>
|
||||||
|
<ErrorReport>prompt</ErrorReport>
|
||||||
|
<WarningLevel>4</WarningLevel>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
|
||||||
|
<PlatformTarget>AnyCPU</PlatformTarget>
|
||||||
|
<DebugType>pdbonly</DebugType>
|
||||||
|
<Optimize>true</Optimize>
|
||||||
|
<OutputPath>bin\Release\</OutputPath>
|
||||||
|
<DefineConstants>TRACE</DefineConstants>
|
||||||
|
<ErrorReport>prompt</ErrorReport>
|
||||||
|
<WarningLevel>4</WarningLevel>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<Reference Include="LemmaSharp">
|
||||||
|
<HintPath>..\LemmaGen\LemmaSharp.dll</HintPath>
|
||||||
|
</Reference>
|
||||||
|
<Reference Include="LemmaSharpPrebuilt">
|
||||||
|
<HintPath>..\LemmaGen\LemmaSharpPrebuilt.dll</HintPath>
|
||||||
|
</Reference>
|
||||||
|
<Reference Include="LemmaSharpPrebuiltCompact">
|
||||||
|
<HintPath>..\LemmaGen\LemmaSharpPrebuiltCompact.dll</HintPath>
|
||||||
|
</Reference>
|
||||||
|
<Reference Include="Lzma#">
|
||||||
|
<HintPath>..\LemmaGen\Lzma#.dll</HintPath>
|
||||||
|
</Reference>
|
||||||
|
<Reference Include="System" />
|
||||||
|
<Reference Include="System.Core" />
|
||||||
|
<Reference Include="System.Xml.Linq" />
|
||||||
|
<Reference Include="System.Data.DataSetExtensions" />
|
||||||
|
<Reference Include="Microsoft.CSharp" />
|
||||||
|
<Reference Include="System.Data" />
|
||||||
|
<Reference Include="System.Net.Http" />
|
||||||
|
<Reference Include="System.Xml" />
|
||||||
|
</ItemGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<Compile Include="Program.cs" />
|
||||||
|
<Compile Include="Properties\AssemblyInfo.cs" />
|
||||||
|
<Compile Include="SentenceLemmatizer.cs" />
|
||||||
|
</ItemGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<None Include="App.config" />
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
|
||||||
|
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
|
||||||
|
Other similar extension points exist, see Microsoft.Common.targets.
|
||||||
|
<Target Name="BeforeBuild">
|
||||||
|
</Target>
|
||||||
|
<Target Name="AfterBuild">
|
||||||
|
</Target>
|
||||||
|
-->
|
||||||
|
</Project>
|
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|AnyCPU'">
|
||||||
|
<StartArguments>pl</StartArguments>
|
||||||
|
</PropertyGroup>
|
||||||
|
</Project>
|
@ -0,0 +1,32 @@
|
|||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Text;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
|
||||||
|
namespace LemmaGenSentenceLemmatizer
|
||||||
|
{
|
||||||
|
/// <summary>
/// Command-line entry point: reads sentences from standard input, one per
/// line, and writes the lemmatized form of each sentence to standard output.
/// </summary>
class Program
{
    /// <summary>
    /// Expects exactly one argument, the language code handed to
    /// <see cref="SentenceLemmatizer"/>. Every input line yields exactly one
    /// output line, so the output stays line-aligned with the input.
    /// </summary>
    /// <param name="args">Command-line arguments; args[0] is the language code.</param>
    static void Main(string[] args)
    {
        if (args.Length != 1)
        {
            // Report usage problems on stderr so they never get mixed into the
            // lemmatized output stream, and signal the failure to the caller.
            Console.Error.WriteLine("Wrong number of parameters. Please specify the language code as the only parameter");
            Environment.ExitCode = 1;
            return;
        }

        SentenceLemmatizer lemmatizer = new SentenceLemmatizer(args[0]);

        // Console.ReadLine() returns null only at end of input. Looping until
        // null (rather than until the first empty line) keeps blank lines in
        // the input from silently truncating the output.
        string line;
        while ((line = Console.ReadLine()) != null)
        {
            Console.WriteLine(lemmatizer.lemmatizeSentence(line));
        }
    }
}
|
||||||
|
}
|
@ -0,0 +1,36 @@
|
|||||||
|
using System.Reflection;
|
||||||
|
using System.Runtime.CompilerServices;
|
||||||
|
using System.Runtime.InteropServices;
|
||||||
|
|
||||||
|
// General Information about an assembly is controlled through the following
|
||||||
|
// set of attributes. Change these attribute values to modify the information
|
||||||
|
// associated with an assembly.
|
||||||
|
[assembly: AssemblyTitle("LemmaGenSentenceLemmatizer")]
|
||||||
|
[assembly: AssemblyDescription("")]
|
||||||
|
[assembly: AssemblyConfiguration("")]
|
||||||
|
[assembly: AssemblyCompany("")]
|
||||||
|
[assembly: AssemblyProduct("LemmaGenSentenceLemmatizer")]
|
||||||
|
[assembly: AssemblyCopyright("Copyright © 2017")]
|
||||||
|
[assembly: AssemblyTrademark("")]
|
||||||
|
[assembly: AssemblyCulture("")]
|
||||||
|
|
||||||
|
// Setting ComVisible to false makes the types in this assembly not visible
|
||||||
|
// to COM components. If you need to access a type in this assembly from
|
||||||
|
// COM, set the ComVisible attribute to true on that type.
|
||||||
|
[assembly: ComVisible(false)]
|
||||||
|
|
||||||
|
// The following GUID is for the ID of the typelib if this project is exposed to COM
|
||||||
|
[assembly: Guid("66b48b61-7304-4352-8720-dd664ad06138")]
|
||||||
|
|
||||||
|
// Version information for an assembly consists of the following four values:
|
||||||
|
//
|
||||||
|
// Major Version
|
||||||
|
// Minor Version
|
||||||
|
// Build Number
|
||||||
|
// Revision
|
||||||
|
//
|
||||||
|
// You can specify all the values or you can default the Build and Revision Numbers
|
||||||
|
// by using the '*' as shown below:
|
||||||
|
// [assembly: AssemblyVersion("1.0.*")]
|
||||||
|
[assembly: AssemblyVersion("1.0.0.0")]
|
||||||
|
[assembly: AssemblyFileVersion("1.0.0.0")]
|
@ -0,0 +1,81 @@
|
|||||||
|
using LemmaSharp;
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Text;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
|
||||||
|
namespace LemmaGenSentenceLemmatizer
|
||||||
|
{
|
||||||
|
/// <summary>
/// Lemmatizes whitespace-separated sentences word by word using a prebuilt
/// LemmaSharp model selected by language code.
/// </summary>
class SentenceLemmatizer
{
    // Separator used to split compound tokens such as "first-second".
    private readonly char[] wordInnerSeparator = { '-' };

    private readonly ILemmatizer lemmatizer;

    /// <summary>
    /// Creates a lemmatizer for the given language.
    /// </summary>
    /// <param name="languageCode">One of "pl", "en" or "hr".</param>
    /// <exception cref="ArgumentException">The language code is not supported.</exception>
    public SentenceLemmatizer(string languageCode)
    {
        switch (languageCode)
        {
            case "pl":
                lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Polish);
                break;
            case "en":
                lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);
                break;
            case "hr":
                // NOTE(review): "hr" is served by the Serbian model; presumably
                // no dedicated Croatian prebuilt model is available - confirm.
                lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Serbian);
                break;
            default:
                throw new ArgumentException("Unknown language code: " + languageCode);
        }
    }

    /// <summary>
    /// Splits the sentence on whitespace, lemmatizes every token and joins the
    /// results with single spaces. Leading and trailing whitespace is trimmed
    /// from the returned string.
    /// </summary>
    /// <param name="sentence">The sentence to lemmatize; must not be null.</param>
    /// <returns>The lemmatized sentence.</returns>
    /// <exception cref="ArgumentNullException">The sentence is null.</exception>
    public string lemmatizeSentence(string sentence)
    {
        if (sentence == null)
        {
            throw new ArgumentNullException("sentence");
        }

        // A null separator array makes Split use whitespace as the delimiter.
        string[] tokens = sentence.Split((char[])null);

        string[] lemmas = new string[tokens.Length];
        for (int i = 0; i < tokens.Length; i++)
        {
            lemmas[i] = lemmatizeWord(tokens[i]);
        }

        // Join once instead of concatenating in a loop (avoids quadratic copying).
        return string.Join(" ", lemmas).Trim();
    }

    /// <summary>
    /// Lemmatizes a single token. A token with exactly one hyphen is treated
    /// as a two-part compound whose parts are lemmatized separately. If the
    /// lemmatizer yields nothing usable, the original token is returned.
    /// </summary>
    private string lemmatizeWord(string word)
    {
        string result;
        string[] parts = word.Split(wordInnerSeparator);
        if (parts.Length == 2)
        {
            // A first part ending in "o" is kept verbatim.
            // NOTE(review): presumably this protects combining forms such as
            // "polsko-" from being lemmatized - confirm the intent.
            string firstPart = parts[0];
            if (!firstPart.EndsWith("o", StringComparison.Ordinal))
            {
                firstPart = lemmatizer.Lemmatize(firstPart);
            }
            string secondPart = lemmatizer.Lemmatize(parts[1]);
            result = firstPart + "-" + secondPart;
        }
        else
        {
            result = lemmatizer.Lemmatize(word);
        }

        // Fall back to the surface form when the lemma is missing or contains
        // a space, which would break the one-token-in / one-token-out output.
        if (string.IsNullOrEmpty(result) || result.IndexOf(' ') >= 0)
        {
            return word;
        }

        return result;
    }
}
|
||||||
|
}
|
Binary file not shown.
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8" ?>
|
||||||
|
<configuration>
|
||||||
|
<startup>
|
||||||
|
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2" />
|
||||||
|
</startup>
|
||||||
|
</configuration>
|
Binary file not shown.
Binary file not shown.
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8" ?>
|
||||||
|
<configuration>
|
||||||
|
<startup>
|
||||||
|
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2" />
|
||||||
|
</startup>
|
||||||
|
</configuration>
|
@ -0,0 +1,11 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||||
|
<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0">
|
||||||
|
<assemblyIdentity version="1.0.0.0" name="MyApplication.app"/>
|
||||||
|
<trustInfo xmlns="urn:schemas-microsoft-com:asm.v2">
|
||||||
|
<security>
|
||||||
|
<requestedPrivileges xmlns="urn:schemas-microsoft-com:asm.v3">
|
||||||
|
<requestedExecutionLevel level="asInvoker" uiAccess="false"/>
|
||||||
|
</requestedPrivileges>
|
||||||
|
</security>
|
||||||
|
</trustInfo>
|
||||||
|
</assembly>
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,10 @@
|
|||||||
|
j:\documents\visual studio 2015\Projects\LemmaGenSentenceLemmatizer\LemmaGenSentenceLemmatizer\bin\Debug\LemmaGenSentenceLemmatizer.exe.config
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSentenceLemmatizer\LemmaGenSentenceLemmatizer\bin\Debug\LemmaGenSentenceLemmatizer.exe
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSentenceLemmatizer\LemmaGenSentenceLemmatizer\bin\Debug\LemmaGenSentenceLemmatizer.pdb
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSentenceLemmatizer\LemmaGenSentenceLemmatizer\bin\Debug\LemmaSharp.dll
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSentenceLemmatizer\LemmaGenSentenceLemmatizer\bin\Debug\LemmaSharpPrebuilt.dll
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSentenceLemmatizer\LemmaGenSentenceLemmatizer\bin\Debug\LemmaSharpPrebuiltCompact.dll
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSentenceLemmatizer\LemmaGenSentenceLemmatizer\bin\Debug\Lzma#.dll
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSentenceLemmatizer\LemmaGenSentenceLemmatizer\obj\Debug\LemmaGenSentenceLemmatizer.csprojResolveAssemblyReference.cache
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSentenceLemmatizer\LemmaGenSentenceLemmatizer\obj\Debug\LemmaGenSentenceLemmatizer.exe
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSentenceLemmatizer\LemmaGenSentenceLemmatizer\obj\Debug\LemmaGenSentenceLemmatizer.pdb
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user