lemmatization proof of concept
This commit is contained in:
parent
6eb2940d59
commit
803ea2660f
1
.gitignore
vendored
1
.gitignore
vendored
@ -15,6 +15,7 @@ db/pgbouncer.pid
|
|||||||
db/pgbouncer.ini
|
db/pgbouncer.ini
|
||||||
upstart/concordia-server.conf
|
upstart/concordia-server.conf
|
||||||
upstart/pgbouncer.conf
|
upstart/pgbouncer.conf
|
||||||
|
upstart/lemmagen.conf
|
||||||
cat/host.cfg
|
cat/host.cfg
|
||||||
mgiza-aligner/mgiza/mgizapp/CMakeCache.txt
|
mgiza-aligner/mgiza/mgizapp/CMakeCache.txt
|
||||||
mgiza-aligner/mgiza/mgizapp/CMakeFiles/
|
mgiza-aligner/mgiza/mgizapp/CMakeFiles/
|
||||||
|
@ -27,6 +27,7 @@ configure_file (
|
|||||||
)
|
)
|
||||||
|
|
||||||
set(COMPILED_BINARIES_PATH "${concordia-server_SOURCE_DIR}/build/concordia-server")
|
set(COMPILED_BINARIES_PATH "${concordia-server_SOURCE_DIR}/build/concordia-server")
|
||||||
|
set(LEMMAGEN_BINARIES_PATH "${concordia-server_SOURCE_DIR}/LemmaGenSockets/LemmaGenSockets/bin/Debug")
|
||||||
set(SCRIPTS_PATH "${concordia-server_SOURCE_DIR}/scripts")
|
set(SCRIPTS_PATH "${concordia-server_SOURCE_DIR}/scripts")
|
||||||
configure_file (
|
configure_file (
|
||||||
"${concordia-server_SOURCE_DIR}/scripts/cmake_stubs/start.sh.in"
|
"${concordia-server_SOURCE_DIR}/scripts/cmake_stubs/start.sh.in"
|
||||||
@ -55,6 +56,12 @@ configure_file (
|
|||||||
"${concordia-server_SOURCE_DIR}/upstart/pgbouncer.conf"
|
"${concordia-server_SOURCE_DIR}/upstart/pgbouncer.conf"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
configure_file (
|
||||||
|
"${concordia-server_SOURCE_DIR}/upstart/cmake_stubs/lemmagen.conf.in"
|
||||||
|
"${concordia-server_SOURCE_DIR}/upstart/lemmagen.conf"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
configure_file (
|
configure_file (
|
||||||
"${concordia-server_SOURCE_DIR}/db/pgbouncer.ini.in"
|
"${concordia-server_SOURCE_DIR}/db/pgbouncer.ini.in"
|
||||||
"${concordia-server_SOURCE_DIR}/db/pgbouncer.ini"
|
"${concordia-server_SOURCE_DIR}/db/pgbouncer.ini"
|
||||||
@ -119,7 +126,7 @@ endif(EXISTS ${ICU_LIB} AND EXISTS ${ICU_INCLUDE})
|
|||||||
# ----------------------------------------------------
|
# ----------------------------------------------------
|
||||||
set(Boost_USE_STATIC_LIBS OFF)
|
set(Boost_USE_STATIC_LIBS OFF)
|
||||||
set(Boost_USE_STATIC_RUNTIME OFF)
|
set(Boost_USE_STATIC_RUNTIME OFF)
|
||||||
find_package(Boost COMPONENTS
|
find_package(Boost COMPONENTS
|
||||||
serialization unit_test_framework system filesystem program_options iostreams regex locale REQUIRED)
|
serialization unit_test_framework system filesystem program_options iostreams regex locale REQUIRED)
|
||||||
|
|
||||||
# ----------------------------------------------------
|
# ----------------------------------------------------
|
||||||
@ -214,8 +221,3 @@ if(EXISTS ${UTF8CASE_LIB})
|
|||||||
endif(EXISTS ${UTF8CASE_LIB})
|
endif(EXISTS ${UTF8CASE_LIB})
|
||||||
|
|
||||||
add_subdirectory(concordia-server)
|
add_subdirectory(concordia-server)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
BIN
LemmaGenSockets/LemmaGen/LemmaSharp.dll
Normal file
BIN
LemmaGenSockets/LemmaGen/LemmaSharp.dll
Normal file
Binary file not shown.
BIN
LemmaGenSockets/LemmaGen/LemmaSharpPrebuilt.dll
Normal file
BIN
LemmaGenSockets/LemmaGen/LemmaSharpPrebuilt.dll
Normal file
Binary file not shown.
BIN
LemmaGenSockets/LemmaGen/LemmaSharpPrebuiltCompact.dll
Normal file
BIN
LemmaGenSockets/LemmaGen/LemmaSharpPrebuiltCompact.dll
Normal file
Binary file not shown.
BIN
LemmaGenSockets/LemmaGen/Lzma#.dll
Normal file
BIN
LemmaGenSockets/LemmaGen/Lzma#.dll
Normal file
Binary file not shown.
22
LemmaGenSockets/LemmaGenSockets.sln
Normal file
22
LemmaGenSockets/LemmaGenSockets.sln
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||||
|
# Visual Studio 14
|
||||||
|
VisualStudioVersion = 14.0.25420.1
|
||||||
|
MinimumVisualStudioVersion = 10.0.40219.1
|
||||||
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LemmaGenSockets", "LemmaGenSockets\LemmaGenSockets.csproj", "{3098BC55-2CC9-4612-9F79-8C812B3BE539}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|Any CPU = Debug|Any CPU
|
||||||
|
Release|Any CPU = Release|Any CPU
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{3098BC55-2CC9-4612-9F79-8C812B3BE539}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||||
|
{3098BC55-2CC9-4612-9F79-8C812B3BE539}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
|
{3098BC55-2CC9-4612-9F79-8C812B3BE539}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
|
{3098BC55-2CC9-4612-9F79-8C812B3BE539}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
6
LemmaGenSockets/LemmaGenSockets/App.config
Normal file
6
LemmaGenSockets/LemmaGenSockets/App.config
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8" ?>
|
||||||
|
<configuration>
|
||||||
|
<startup>
|
||||||
|
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2" />
|
||||||
|
</startup>
|
||||||
|
</configuration>
|
73
LemmaGenSockets/LemmaGenSockets/LemmaGenSockets.csproj
Normal file
73
LemmaGenSockets/LemmaGenSockets/LemmaGenSockets.csproj
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
|
||||||
|
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
|
||||||
|
<ProjectGuid>{3098BC55-2CC9-4612-9F79-8C812B3BE539}</ProjectGuid>
|
||||||
|
<OutputType>Exe</OutputType>
|
||||||
|
<AppDesignerFolder>Properties</AppDesignerFolder>
|
||||||
|
<RootNamespace>LemmaGenSockets</RootNamespace>
|
||||||
|
<AssemblyName>LemmaGenSockets</AssemblyName>
|
||||||
|
<TargetFrameworkVersion>v4.5.2</TargetFrameworkVersion>
|
||||||
|
<FileAlignment>512</FileAlignment>
|
||||||
|
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
|
||||||
|
<PlatformTarget>AnyCPU</PlatformTarget>
|
||||||
|
<DebugSymbols>true</DebugSymbols>
|
||||||
|
<DebugType>full</DebugType>
|
||||||
|
<Optimize>false</Optimize>
|
||||||
|
<OutputPath>bin\Debug\</OutputPath>
|
||||||
|
<DefineConstants>DEBUG;TRACE</DefineConstants>
|
||||||
|
<ErrorReport>prompt</ErrorReport>
|
||||||
|
<WarningLevel>4</WarningLevel>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
|
||||||
|
<PlatformTarget>AnyCPU</PlatformTarget>
|
||||||
|
<DebugType>pdbonly</DebugType>
|
||||||
|
<Optimize>true</Optimize>
|
||||||
|
<OutputPath>bin\Release\</OutputPath>
|
||||||
|
<DefineConstants>TRACE</DefineConstants>
|
||||||
|
<ErrorReport>prompt</ErrorReport>
|
||||||
|
<WarningLevel>4</WarningLevel>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<Reference Include="LemmaSharp">
|
||||||
|
<HintPath>..\LemmaGen\LemmaSharp.dll</HintPath>
|
||||||
|
</Reference>
|
||||||
|
<Reference Include="LemmaSharpPrebuilt">
|
||||||
|
<HintPath>..\LemmaGen\LemmaSharpPrebuilt.dll</HintPath>
|
||||||
|
</Reference>
|
||||||
|
<Reference Include="LemmaSharpPrebuiltCompact">
|
||||||
|
<HintPath>..\LemmaGen\LemmaSharpPrebuiltCompact.dll</HintPath>
|
||||||
|
</Reference>
|
||||||
|
<Reference Include="Lzma#">
|
||||||
|
<HintPath>..\LemmaGen\Lzma#.dll</HintPath>
|
||||||
|
</Reference>
|
||||||
|
<Reference Include="System" />
|
||||||
|
<Reference Include="System.Core" />
|
||||||
|
<Reference Include="System.Xml.Linq" />
|
||||||
|
<Reference Include="System.Data.DataSetExtensions" />
|
||||||
|
<Reference Include="Microsoft.CSharp" />
|
||||||
|
<Reference Include="System.Data" />
|
||||||
|
<Reference Include="System.Net.Http" />
|
||||||
|
<Reference Include="System.Xml" />
|
||||||
|
</ItemGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<Compile Include="LemmatizerListener.cs" />
|
||||||
|
<Compile Include="Program.cs" />
|
||||||
|
<Compile Include="Properties\AssemblyInfo.cs" />
|
||||||
|
</ItemGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<None Include="App.config" />
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
|
||||||
|
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
|
||||||
|
Other similar extension points exist, see Microsoft.Common.targets.
|
||||||
|
<Target Name="BeforeBuild">
|
||||||
|
</Target>
|
||||||
|
<Target Name="AfterBuild">
|
||||||
|
</Target>
|
||||||
|
-->
|
||||||
|
</Project>
|
130
LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs
Normal file
130
LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Text;
|
||||||
|
using System.Net;
|
||||||
|
using System.Net.Sockets;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
using LemmaSharp;
|
||||||
|
|
||||||
|
namespace LemmaGenSockets
|
||||||
|
{
|
||||||
|
/// <summary>
/// Single-threaded TCP service that lemmatizes sentences on request.
///
/// Wire protocol (both directions are UTF-8 text terminated by "@#@"):
///   request  = two-character language code + sentence + "@#@"
///   response = lemmatized sentence + "@#@"
/// One request is served per connection, after which the connection is closed.
/// </summary>
class LemmatizerListener
{
    // Marker that ends every request and every response.
    private const string MessageTerminator = "@#@";

    // Number of leading characters of a request that hold the language code.
    private const int LanguageCodeLength = 2;

    // Lemmatizers keyed by the language code used in requests.
    private Dictionary<String, ILemmatizer> lemmatizersDict = new Dictionary<string, ILemmatizer>();

    // Separator between the two halves of a hyphenated compound word.
    private char[] wordInnerSeparator = { '-' };

    /// <summary>Registers the prebuilt lemmatizers for the supported languages.</summary>
    private void initializeLemmatizers()
    {
        lemmatizersDict.Add("pl", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Polish));
        lemmatizersDict.Add("en", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English));
    }

    public LemmatizerListener()
    {
        initializeLemmatizers();
    }

    /// <summary>
    /// Lemmatizes every whitespace-separated token of a sentence and joins the
    /// results with single spaces; leading and trailing whitespace is trimmed.
    /// </summary>
    /// <param name="languageCode">Key of a registered lemmatizer, e.g. "pl" or "en".</param>
    /// <param name="sentence">Sentence to lemmatize.</param>
    /// <exception cref="ArgumentException">No lemmatizer is registered for the language code.</exception>
    private string lemmatizeSentence(string languageCode, string sentence)
    {
        if (!lemmatizersDict.ContainsKey(languageCode))
        {
            throw new ArgumentException("Unsupported language code: " + languageCode);
        }

        // A null separator array splits on any Unicode whitespace character.
        string[] tokens = sentence.Split((char[])null);

        return string.Join(" ", tokens.Select(token => lemmatizeWord(languageCode, token))).Trim();
    }

    /// <summary>
    /// Lemmatizes a single token. A token made of exactly two hyphen-separated
    /// parts is lemmatized part by part and re-joined with a hyphen.
    /// </summary>
    private string lemmatizeWord(string languageCode, string word)
    {
        ILemmatizer lemmatizer = lemmatizersDict[languageCode];

        string[] parts = word.Split(wordInnerSeparator);
        if (parts.Length != 2)
        {
            return lemmatizer.Lemmatize(word);
        }

        // A first part ending in "o" is kept verbatim.
        // NOTE(review): presumably this protects connecting forms of compound
        // words (such as "polsko-") from being altered -- confirm the intent,
        // since the rule is applied to every language.
        string firstPart = parts[0];
        if (!firstPart.EndsWith("o"))
        {
            firstPart = lemmatizer.Lemmatize(firstPart);
        }
        string secondPart = lemmatizer.Lemmatize(parts[1]);

        return firstPart + "-" + secondPart;
    }

    /// <summary>
    /// Reads one terminated request from the client.
    /// </summary>
    /// <returns>
    /// The request text without the terminator, or null when the client closed
    /// the connection before a complete request arrived.
    /// </returns>
    private static string receiveRequest(Socket handler)
    {
        byte[] buffer = new byte[1024];

        // A stateful decoder keeps the leading bytes of a multi-byte UTF-8
        // character that was split across two Receive calls, so the character
        // is decoded correctly once its remaining bytes arrive.
        Decoder decoder = Encoding.UTF8.GetDecoder();
        char[] chars = new char[Encoding.UTF8.GetMaxCharCount(buffer.Length)];
        StringBuilder data = new StringBuilder();

        while (true)
        {
            int bytesRec = handler.Receive(buffer);
            if (bytesRec == 0)
            {
                // Zero bytes means the peer shut the connection down; without
                // this check the loop would spin forever.
                return null;
            }

            int charCount = decoder.GetChars(buffer, 0, bytesRec, chars, 0);
            data.Append(chars, 0, charCount);

            int terminatorIndex = data.ToString().IndexOf(MessageTerminator, StringComparison.Ordinal);
            if (terminatorIndex > -1)
            {
                return data.ToString(0, terminatorIndex);
            }
        }
    }

    /// <summary>
    /// Serves a single client: reads the request, lemmatizes the sentence and
    /// sends the terminated answer back. Malformed requests are logged and the
    /// client receives no answer.
    /// </summary>
    private void handleClient(Socket handler)
    {
        string request = receiveRequest(handler);
        if (request == null)
        {
            Console.WriteLine("Client disconnected before sending a complete request.");
            return;
        }
        if (request.Length < LanguageCodeLength)
        {
            Console.WriteLine("Malformed request (missing language code): " + request);
            return;
        }

        string languageCode = request.Substring(0, LanguageCodeLength);
        string sentence = request.Substring(LanguageCodeLength);

        // Send lemmatized data back to client.
        byte[] msg = Encoding.UTF8.GetBytes(lemmatizeSentence(languageCode, sentence) + MessageTerminator);

        handler.Send(msg);
        handler.Shutdown(SocketShutdown.Both);
    }

    /// <summary>
    /// Binds to 127.0.0.1:11000 and serves clients one at a time until the
    /// listening socket itself fails. A failure while serving one client is
    /// logged and does not stop the service.
    /// </summary>
    public void DoListening()
    {
        // Establish the local endpoint for the socket.
        IPAddress ipAddress = IPAddress.Parse("127.0.0.1");
        IPEndPoint localEndPoint = new IPEndPoint(ipAddress, 11000);

        // Create a TCP/IP socket; 'using' releases it when listening ends.
        using (Socket listener = new Socket(AddressFamily.InterNetwork,
            SocketType.Stream, ProtocolType.Tcp))
        {
            try
            {
                // Bind the socket to the local endpoint and
                // listen for incoming connections.
                listener.Bind(localEndPoint);
                listener.Listen(10);

                while (true)
                {
                    // Program is suspended while waiting for an incoming connection.
                    Socket handler = listener.Accept();
                    try
                    {
                        handleClient(handler);
                    }
                    catch (Exception e)
                    {
                        // One bad request or broken connection must not take
                        // the whole service down.
                        Console.WriteLine(e.ToString());
                    }
                    finally
                    {
                        handler.Close();
                    }
                }
            }
            catch (Exception e)
            {
                Console.WriteLine(e.ToString());
            }
        }
    }
}
|
||||||
|
}
|
26
LemmaGenSockets/LemmaGenSockets/Program.cs
Normal file
26
LemmaGenSockets/LemmaGenSockets/Program.cs
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Text;
|
||||||
|
using System.Net;
|
||||||
|
using System.Net.Sockets;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
using LemmaSharp;
|
||||||
|
|
||||||
|
namespace LemmaGenSockets
|
||||||
|
{
|
||||||
|
/// <summary>
/// Console entry point: creates the lemmatizer service and runs its
/// listening loop on the calling thread.
/// </summary>
class Program
{
    // Incoming data from the client.
    public static string data = null;

    static void Main(string[] args)
    {
        new LemmatizerListener().DoListening();
    }
}
|
||||||
|
}
|
36
LemmaGenSockets/LemmaGenSockets/Properties/AssemblyInfo.cs
Normal file
36
LemmaGenSockets/LemmaGenSockets/Properties/AssemblyInfo.cs
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
using System.Reflection;
|
||||||
|
using System.Runtime.CompilerServices;
|
||||||
|
using System.Runtime.InteropServices;
|
||||||
|
|
||||||
|
// General Information about an assembly is controlled through the following
|
||||||
|
// set of attributes. Change these attribute values to modify the information
|
||||||
|
// associated with an assembly.
|
||||||
|
[assembly: AssemblyTitle("LemmaGenSockets")]
|
||||||
|
[assembly: AssemblyDescription("")]
|
||||||
|
[assembly: AssemblyConfiguration("")]
|
||||||
|
[assembly: AssemblyCompany("")]
|
||||||
|
[assembly: AssemblyProduct("LemmaGenSockets")]
|
||||||
|
[assembly: AssemblyCopyright("Copyright © 2017")]
|
||||||
|
[assembly: AssemblyTrademark("")]
|
||||||
|
[assembly: AssemblyCulture("")]
|
||||||
|
|
||||||
|
// Setting ComVisible to false makes the types in this assembly not visible
|
||||||
|
// to COM components. If you need to access a type in this assembly from
|
||||||
|
// COM, set the ComVisible attribute to true on that type.
|
||||||
|
[assembly: ComVisible(false)]
|
||||||
|
|
||||||
|
// The following GUID is for the ID of the typelib if this project is exposed to COM
|
||||||
|
[assembly: Guid("3098bc55-2cc9-4612-9f79-8c812b3be539")]
|
||||||
|
|
||||||
|
// Version information for an assembly consists of the following four values:
|
||||||
|
//
|
||||||
|
// Major Version
|
||||||
|
// Minor Version
|
||||||
|
// Build Number
|
||||||
|
// Revision
|
||||||
|
//
|
||||||
|
// You can specify all the values or you can default the Build and Revision Numbers
|
||||||
|
// by using the '*' as shown below:
|
||||||
|
// [assembly: AssemblyVersion("1.0.*")]
|
||||||
|
[assembly: AssemblyVersion("1.0.0.0")]
|
||||||
|
[assembly: AssemblyFileVersion("1.0.0.0")]
|
BIN
LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe
Normal file
BIN
LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe
Normal file
Binary file not shown.
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8" ?>
|
||||||
|
<configuration>
|
||||||
|
<startup>
|
||||||
|
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2" />
|
||||||
|
</startup>
|
||||||
|
</configuration>
|
BIN
LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb
Normal file
BIN
LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb
Normal file
Binary file not shown.
Binary file not shown.
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8" ?>
|
||||||
|
<configuration>
|
||||||
|
<startup>
|
||||||
|
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2" />
|
||||||
|
</startup>
|
||||||
|
</configuration>
|
@ -0,0 +1,11 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||||
|
<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0">
|
||||||
|
<assemblyIdentity version="1.0.0.0" name="MyApplication.app"/>
|
||||||
|
<trustInfo xmlns="urn:schemas-microsoft-com:asm.v2">
|
||||||
|
<security>
|
||||||
|
<requestedPrivileges xmlns="urn:schemas-microsoft-com:asm.v3">
|
||||||
|
<requestedExecutionLevel level="asInvoker" uiAccess="false"/>
|
||||||
|
</requestedPrivileges>
|
||||||
|
</security>
|
||||||
|
</trustInfo>
|
||||||
|
</assembly>
|
BIN
LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaSharp.dll
Normal file
BIN
LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaSharp.dll
Normal file
Binary file not shown.
BIN
LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaSharpPrebuilt.dll
Normal file
BIN
LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaSharpPrebuilt.dll
Normal file
Binary file not shown.
Binary file not shown.
BIN
LemmaGenSockets/LemmaGenSockets/bin/Debug/Lzma#.dll
Normal file
BIN
LemmaGenSockets/LemmaGenSockets/bin/Debug/Lzma#.dll
Normal file
Binary file not shown.
Binary file not shown.
@ -0,0 +1,10 @@
|
|||||||
|
j:\documents\visual studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe.config
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.pdb
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharp.dll
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuilt.dll
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuiltCompact.dll
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\Lzma#.dll
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe
|
||||||
|
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb
|
Binary file not shown.
BIN
LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe
Normal file
BIN
LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe
Normal file
Binary file not shown.
BIN
LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb
Normal file
BIN
LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb
Normal file
Binary file not shown.
@ -1,19 +1,6 @@
|
|||||||
# The FastCGI server executable.
#
# Sources are listed explicitly rather than collected with file(GLOB):
# a glob is evaluated only at configure time, so newly added files are
# missed until CMake is re-run, and any stray .cpp in this directory would
# be compiled into the server. Add new translation units to this list.
add_executable(concordia_server_process
  concordia_server_process.cpp
  concordia_server.cpp
  index_controller.cpp
  searcher_controller.cpp
  json_generator.cpp
  unit_dao.cpp
  db_connection.cpp
  query_param.cpp
  string_param.cpp
  int_param.cpp
  logger.cpp
  int_array_param.cpp
  simple_search_result.cpp
  complete_concordia_search_result.cpp
  tm_dao.cpp
  socket_lemmatizer.cpp
)

# PRIVATE: these libraries are needed only to build and link the executable.
target_link_libraries(concordia_server_process
  PRIVATE
    fcgi
    fcgi++
    pq
    concordia
    config++
    log4cpp
    ${Boost_LIBRARIES}
    divsufsort
    utf8case
    icuuc
)
|
||||||
|
@ -11,6 +11,7 @@
|
|||||||
#include "json_generator.hpp"
|
#include "json_generator.hpp"
|
||||||
#include "config.hpp"
|
#include "config.hpp"
|
||||||
#include "logger.hpp"
|
#include "logger.hpp"
|
||||||
|
#include "socket_lemmatizer.hpp"
|
||||||
#include "rapidjson/rapidjson.h"
|
#include "rapidjson/rapidjson.h"
|
||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
#include <boost/ptr_container/ptr_map.hpp>
|
#include <boost/ptr_container/ptr_map.hpp>
|
||||||
@ -26,7 +27,7 @@ ConcordiaServer::ConcordiaServer(const std::string & configFilePath)
|
|||||||
_addTm(tmId);
|
_addTm(tmId);
|
||||||
}
|
}
|
||||||
_indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap));
|
_indexController = boost::shared_ptr<IndexController> (new IndexController(_concordiasMap));
|
||||||
_searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap));
|
_searcherController = boost::shared_ptr<SearcherController> (new SearcherController(_concordiasMap));
|
||||||
}
|
}
|
||||||
|
|
||||||
ConcordiaServer::~ConcordiaServer() {
|
ConcordiaServer::~ConcordiaServer() {
|
||||||
@ -50,7 +51,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
|
|||||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
||||||
} else { // json parsed
|
} else { // json parsed
|
||||||
std::string operation = _getStringParameter(d, OPERATION_PARAM);
|
std::string operation = _getStringParameter(d, OPERATION_PARAM);
|
||||||
if (operation == ADD_SENTENCE_OP) {
|
if (operation == ADD_SENTENCE_OP) {
|
||||||
std::string sourceSentence = _getStringParameter(d, SOURCE_SENTENCE_PARAM);
|
std::string sourceSentence = _getStringParameter(d, SOURCE_SENTENCE_PARAM);
|
||||||
std::string targetSentence = _getStringParameter(d, TARGET_SENTENCE_PARAM);
|
std::string targetSentence = _getStringParameter(d, TARGET_SENTENCE_PARAM);
|
||||||
int tmId = _getIntParameter(d, TM_ID_PARAM);
|
int tmId = _getIntParameter(d, TM_ID_PARAM);
|
||||||
@ -93,6 +94,15 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
_indexController->addAlignedSentences(jsonWriter, sourceSentences, targetSentences, tmId);
|
_indexController->addAlignedSentences(jsonWriter, sourceSentences, targetSentences, tmId);
|
||||||
|
} else if (operation == "lemmatize") {
|
||||||
|
std::string sentence = _getStringParameter(d, "sentence");
|
||||||
|
std::string languageCode = _getStringParameter(d, "languageCode");
|
||||||
|
SocketLemmatizer lemmatizer;
|
||||||
|
std::string lemmatizedSentence = lemmatizer.lemmatizeSentence(languageCode, sentence);
|
||||||
|
jsonWriter.StartObject();
|
||||||
|
jsonWriter.String("lemmatizedSentence");
|
||||||
|
jsonWriter.String(lemmatizedSentence.c_str());
|
||||||
|
jsonWriter.EndObject();
|
||||||
} else if (operation == REFRESH_INDEX_OP) {
|
} else if (operation == REFRESH_INDEX_OP) {
|
||||||
int tmId = _getIntParameter(d, TM_ID_PARAM);
|
int tmId = _getIntParameter(d, TM_ID_PARAM);
|
||||||
_indexController->refreshIndexFromRAM(jsonWriter, tmId);
|
_indexController->refreshIndexFromRAM(jsonWriter, tmId);
|
||||||
@ -104,7 +114,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
|
|||||||
std::string pattern = _getStringParameter(d, PATTERN_PARAM);
|
std::string pattern = _getStringParameter(d, PATTERN_PARAM);
|
||||||
int tmId = _getIntParameter(d, TM_ID_PARAM);
|
int tmId = _getIntParameter(d, TM_ID_PARAM);
|
||||||
Logger::logString("concordia search pattern", pattern);
|
Logger::logString("concordia search pattern", pattern);
|
||||||
_searcherController->concordiaSearch(jsonWriter, pattern, tmId);
|
_searcherController->concordiaSearch(jsonWriter, pattern, tmId);
|
||||||
} else if (operation == CONCORDIA_PHRASE_SEARCH_OP) {
|
} else if (operation == CONCORDIA_PHRASE_SEARCH_OP) {
|
||||||
std::string pattern = _getStringParameter(d, PATTERN_PARAM);
|
std::string pattern = _getStringParameter(d, PATTERN_PARAM);
|
||||||
int tmId = _getIntParameter(d, TM_ID_PARAM);
|
int tmId = _getIntParameter(d, TM_ID_PARAM);
|
||||||
@ -114,31 +124,31 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
|
|||||||
const rapidjson::Value & intervalsArray = d[INTERVALS_PARAM];
|
const rapidjson::Value & intervalsArray = d[INTERVALS_PARAM];
|
||||||
for (rapidjson::SizeType i = 0; i < intervalsArray.Size(); i++) {
|
for (rapidjson::SizeType i = 0; i < intervalsArray.Size(); i++) {
|
||||||
intervals.push_back(Interval(intervalsArray[i][0].GetInt(), intervalsArray[i][1].GetInt()));
|
intervals.push_back(Interval(intervalsArray[i][0].GetInt(), intervalsArray[i][1].GetInt()));
|
||||||
}
|
}
|
||||||
_searcherController->concordiaPhraseSearch(jsonWriter, pattern, intervals, tmId);
|
_searcherController->concordiaPhraseSearch(jsonWriter, pattern, intervals, tmId);
|
||||||
} else if (operation == ADD_TM_OP) {
|
} else if (operation == ADD_TM_OP) {
|
||||||
int sourceLangId = _getIntParameter(d, SOURCE_LANG_PARAM);
|
int sourceLangId = _getIntParameter(d, SOURCE_LANG_PARAM);
|
||||||
int targetLangId = _getIntParameter(d, TARGET_LANG_PARAM);
|
int targetLangId = _getIntParameter(d, TARGET_LANG_PARAM);
|
||||||
std::string name = _getStringParameter(d, NAME_PARAM);
|
std::string name = _getStringParameter(d, NAME_PARAM);
|
||||||
int newId = _tmDAO.addTm(sourceLangId, targetLangId, name);
|
int newId = _tmDAO.addTm(sourceLangId, targetLangId, name);
|
||||||
_addTm(newId);
|
_addTm(newId);
|
||||||
|
|
||||||
jsonWriter.StartObject();
|
jsonWriter.StartObject();
|
||||||
jsonWriter.String("status");
|
jsonWriter.String("status");
|
||||||
jsonWriter.String("success");
|
jsonWriter.String("success");
|
||||||
jsonWriter.String("newTmId");
|
jsonWriter.String("newTmId");
|
||||||
jsonWriter.Int(newId);
|
jsonWriter.Int(newId);
|
||||||
jsonWriter.EndObject();
|
jsonWriter.EndObject();
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
JsonGenerator::signalError(jsonWriter, "no such operation: " + operation);
|
JsonGenerator::signalError(jsonWriter, "no such operation: " + operation);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch (ConcordiaException & e) {
|
} catch (ConcordiaException & e) {
|
||||||
std::stringstream errorstream;
|
std::stringstream errorstream;
|
||||||
errorstream << "concordia error: " << e.what();
|
errorstream << "concordia error: " << e.what();
|
||||||
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
JsonGenerator::signalError(jsonWriter, errorstream.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
outputString << outputJson.GetString();
|
outputString << outputJson.GetString();
|
||||||
@ -182,5 +192,5 @@ void ConcordiaServer::_logPhrase(std::string phraseString) {
|
|||||||
std::ofstream logFile;
|
std::ofstream logFile;
|
||||||
logFile.open(PHRASE_LOG_FILE_PATH, std::ios::out | std::ios::app);
|
logFile.open(PHRASE_LOG_FILE_PATH, std::ios::out | std::ios::app);
|
||||||
logFile << phraseString.substr(0, phraseString.size()-1) << ", \"timestamp\":" << std::time(0) << "}\n";
|
logFile << phraseString.substr(0, phraseString.size()-1) << ", \"timestamp\":" << std::time(0) << "}\n";
|
||||||
logFile.close();
|
logFile.close();
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#ifndef DB_MANAGER_HDR
|
#ifndef DB_CONNECTION_HDR
|
||||||
#define DB_MANAGER_HDR
|
#define DB_CONNECTION_HDR
|
||||||
|
|
||||||
#include <libpq-fe.h>
|
#include <libpq-fe.h>
|
||||||
#include <string>
|
#include <string>
|
||||||
@ -17,7 +17,7 @@ public:
|
|||||||
/*! Destructor.
|
/*! Destructor.
|
||||||
*/
|
*/
|
||||||
virtual ~DBconnection();
|
virtual ~DBconnection();
|
||||||
|
|
||||||
void startTransaction() throw(ConcordiaException);
|
void startTransaction() throw(ConcordiaException);
|
||||||
|
|
||||||
void endTransaction() throw(ConcordiaException);
|
void endTransaction() throw(ConcordiaException);
|
||||||
@ -28,16 +28,16 @@ public:
|
|||||||
std::vector<QueryParam*> params) throw(ConcordiaException);
|
std::vector<QueryParam*> params) throw(ConcordiaException);
|
||||||
|
|
||||||
void clearResult(PGresult * result);
|
void clearResult(PGresult * result);
|
||||||
|
|
||||||
int getIntValue(PGresult * result, int row, int col) throw (ConcordiaException);
|
int getIntValue(PGresult * result, int row, int col) throw (ConcordiaException);
|
||||||
|
|
||||||
std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException);
|
std::string getStringValue(PGresult * result, int row, int col) throw (ConcordiaException);
|
||||||
|
|
||||||
int getRowCount(PGresult * result) throw (ConcordiaException);
|
int getRowCount(PGresult * result) throw (ConcordiaException);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void close();
|
void close();
|
||||||
|
|
||||||
PGconn * _connection;
|
PGconn * _connection;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
90
concordia-server/socket_lemmatizer.cpp
Normal file
90
concordia-server/socket_lemmatizer.cpp
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
#include "socket_lemmatizer.hpp"

#include <unistd.h>   // close
#include <cerrno>     // errno
#include <cstring>    // std::memset, std::strerror
|
||||||
|
|
||||||
|
/**
 * Creates a lemmatizer client and immediately connects it to the
 * lemmatization service on 127.0.0.1, port 11000.
 * Any ConcordiaException raised while connecting propagates to the caller.
 */
SocketLemmatizer::SocketLemmatizer() throw(ConcordiaException)
    : _sock(-1) {
    const std::string lemmatizerHost("127.0.0.1");
    const int lemmatizerPort = 11000;
    _connect(lemmatizerHost, lemmatizerPort);
}
|
||||||
|
|
||||||
|
/**
 * Releases the socket descriptor. A SocketLemmatizer is created for every
 * "lemmatize" request, so leaving the descriptor open would leak one file
 * descriptor per request.
 */
SocketLemmatizer::~SocketLemmatizer() {
    if (_sock != -1) {
        close(_sock);
        _sock = -1;
    }
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
Connect to a host on a certain port number
|
||||||
|
*/
|
||||||
|
bool SocketLemmatizer::_connect(std::string address , int port)
|
||||||
|
{
|
||||||
|
//create socket if it is not already created
|
||||||
|
if(_sock == -1) {
|
||||||
|
//Create socket
|
||||||
|
_sock = socket(AF_INET , SOCK_STREAM , 0);
|
||||||
|
if (_sock == -1) {
|
||||||
|
throw ConcordiaException("Could not create socket for the lemmatizer.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//setup address structure
|
||||||
|
if(inet_addr(address.c_str()) == -1) {
|
||||||
|
struct hostent *he;
|
||||||
|
struct in_addr **addr_list;
|
||||||
|
|
||||||
|
//resolve the hostname, its not an ip address
|
||||||
|
if ( (he = gethostbyname( address.c_str() ) ) == NULL) {
|
||||||
|
//gethostbyname failed
|
||||||
|
throw ConcordiaException("gethostbyname: Failed to resolve hostname");
|
||||||
|
}
|
||||||
|
|
||||||
|
//Cast the h_addr_list to in_addr , since h_addr_list also has the ip address in long format only
|
||||||
|
addr_list = (struct in_addr **) he->h_addr_list;
|
||||||
|
|
||||||
|
for(int i = 0; addr_list[i] != NULL; i++) {
|
||||||
|
_server.sin_addr = *addr_list[i];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else { //plain ip address
|
||||||
|
_server.sin_addr.s_addr = inet_addr(address.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
_server.sin_family = AF_INET;
|
||||||
|
_server.sin_port = htons(port);
|
||||||
|
|
||||||
|
//Connect to remote server
|
||||||
|
if (connect(_sock , (struct sockaddr *) & _server , sizeof(_server)) < 0) {
|
||||||
|
throw ConcordiaException("connect failed. Error");
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
Send data to the connected host
|
||||||
|
*/
|
||||||
|
bool SocketLemmatizer::_send_data(std::string data)
|
||||||
|
{
|
||||||
|
//Send some data
|
||||||
|
if(send(_sock , data.c_str() , strlen(data.c_str() ) , 0) < 0) {
|
||||||
|
throw ConcordiaException("Send failed");
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
Receive data from the connected host
|
||||||
|
*/
|
||||||
|
std::string SocketLemmatizer::_receive(int size=512)
|
||||||
|
{
|
||||||
|
char buffer[size];
|
||||||
|
std::string reply;
|
||||||
|
|
||||||
|
//Receive a reply from the server
|
||||||
|
if(recv(_sock , buffer , sizeof(buffer) , 0) < 0) {
|
||||||
|
throw ConcordiaException("Receive failed");
|
||||||
|
}
|
||||||
|
reply = buffer;
|
||||||
|
return reply;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string SocketLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
|
||||||
|
_send_data(languageCode+sentence+"@#@");
|
||||||
|
std::string reply = _receive(512);
|
||||||
|
return reply.substr(0,reply.find("@#@"));
|
||||||
|
}
|
35
concordia-server/socket_lemmatizer.hpp
Normal file
35
concordia-server/socket_lemmatizer.hpp
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
#ifndef SOCKET_LEMMATIZER_HDR
|
||||||
|
#define SOCKET_LEMMATIZER_HDR
|
||||||
|
|
||||||
|
#include <string>

#include <arpa/inet.h>   // inet_addr
#include <netdb.h>       // hostent
#include <netinet/in.h>  // sockaddr_in, INADDR_NONE
#include <sys/socket.h>  // socket
#include <unistd.h>      // close

#include <concordia/concordia_exception.hpp>
|
||||||
|
|
||||||
|
|
||||||
|
class SocketLemmatizer {
|
||||||
|
public:
|
||||||
|
/*! Constructor.
|
||||||
|
*/
|
||||||
|
SocketLemmatizer() throw(ConcordiaException);
|
||||||
|
/*! Destructor.
|
||||||
|
*/
|
||||||
|
virtual ~SocketLemmatizer();
|
||||||
|
|
||||||
|
std::string lemmatizeSentence(std::string languageCode, std::string sentence);
|
||||||
|
private:
|
||||||
|
bool _connect(std::string, int);
|
||||||
|
|
||||||
|
bool _send_data(std::string data);
|
||||||
|
|
||||||
|
std::string _receive(int);
|
||||||
|
|
||||||
|
int _sock;
|
||||||
|
|
||||||
|
struct sockaddr_in _server;
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
29
tests/lemmatizeSentence.py
Executable file
29
tests/lemmatizeSentence.py
Executable file
@ -0,0 +1,29 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import json
|
||||||
|
import urllib2
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import host
|
||||||
|
|
||||||
|
# Usage: lemmatizeSentence.py <languageCode> <sentence>
#
# Sends a 'lemmatize' request for the given sentence to the Concordia
# server configured in host.py and prints the decoded JSON response
# together with the time the request took.

# Fail with a readable message instead of an IndexError traceback when
# the two required arguments are missing.
if len(sys.argv) < 3:
    sys.stderr.write("Usage: %s <languageCode> <sentence>\n" % sys.argv[0])
    sys.exit(1)

data = {
    'operation': 'lemmatize',
    'languageCode': sys.argv[1],
    'sentence': sys.argv[2]
}

# The port is optional in host.py; append it only when it is set.
address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port

start = time.time()
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
end = time.time()

# Single-argument print calls behave the same as the print statement
# under Python 2 and remain valid syntax under Python 3.
print("Execution time: %.4f seconds." % (end - start))
print("Result: ")
print(response)
|
@ -1,4 +1,4 @@
|
|||||||
In order to configure Concordia as upstart job, copy the 2 .conf files into your /etc/init and run:
|
In order to configure Concordia as upstart job, copy the 3 .conf files into your /etc/init and run:
|
||||||
|
|
||||||
sudo initctl reload-configuration
|
sudo initctl reload-configuration
|
||||||
|
|
||||||
|
23
upstart/cmake_stubs/lemmagen.conf.in
Normal file
23
upstart/cmake_stubs/lemmagen.conf.in
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
# lemmagen
|
||||||
|
#
|
||||||
|
# This service runs the LemmaGen lemmatizer via mono.
# The process listens on port 11000.
|
||||||
|
|
||||||
|
description LemmaGen
|
||||||
|
|
||||||
|
# When to start the service
|
||||||
|
start on started concordia-server
|
||||||
|
|
||||||
|
# When to stop the service
|
||||||
|
stop on runlevel [016]
|
||||||
|
|
||||||
|
# Automatically restart process if crashed
|
||||||
|
respawn
|
||||||
|
|
||||||
|
# Essentially lets upstart know the process will detach itself to the background
|
||||||
|
expect fork
|
||||||
|
|
||||||
|
# Start the process
|
||||||
|
script
|
||||||
|
exec mono @LEMMAGEN_BINARIES_PATH@/LemmaGenSockets.exe &
|
||||||
|
end script
|
Loading…
Reference in New Issue
Block a user