lemmatization proof of concept

This commit is contained in:
Rafał Jaworski 2017-03-03 22:21:00 +01:00
parent 6eb2940d59
commit 803ea2660f
38 changed files with 544 additions and 41 deletions

1
.gitignore vendored
View File

@ -15,6 +15,7 @@ db/pgbouncer.pid
db/pgbouncer.ini
upstart/concordia-server.conf
upstart/pgbouncer.conf
upstart/lemmagen.conf
cat/host.cfg
mgiza-aligner/mgiza/mgizapp/CMakeCache.txt
mgiza-aligner/mgiza/mgizapp/CMakeFiles/

View File

@ -27,6 +27,7 @@ configure_file (
)
# Path variables below are derived from the project source directory.
# COMPILED_BINARIES_PATH points at build/concordia-server inside the source tree.
set(COMPILED_BINARIES_PATH "${concordia-server_SOURCE_DIR}/build/concordia-server")
# LEMMAGEN_BINARIES_PATH points at the LemmaGenSockets bin/Debug directory.
# NOTE(review): this is fixed to the Debug output directory; confirm whether a
# Release build of LemmaGenSockets ever needs to be used instead.
set(LEMMAGEN_BINARIES_PATH "${concordia-server_SOURCE_DIR}/LemmaGenSockets/LemmaGenSockets/bin/Debug")
# SCRIPTS_PATH points at the scripts directory inside the source tree.
set(SCRIPTS_PATH "${concordia-server_SOURCE_DIR}/scripts")
configure_file (
"${concordia-server_SOURCE_DIR}/scripts/cmake_stubs/start.sh.in"
@ -55,6 +56,12 @@ configure_file (
"${concordia-server_SOURCE_DIR}/upstart/pgbouncer.conf"
)
# Generate the upstart job for the LemmaGen lemmatizer from its stub.
# @ONLY limits substitution to @VAR@ placeholders, so any ${...} shell syntax
# placed in the job script is copied through unchanged.
configure_file (
    "${concordia-server_SOURCE_DIR}/upstart/cmake_stubs/lemmagen.conf.in"
    "${concordia-server_SOURCE_DIR}/upstart/lemmagen.conf"
    @ONLY
    )
configure_file (
"${concordia-server_SOURCE_DIR}/db/pgbouncer.ini.in"
"${concordia-server_SOURCE_DIR}/db/pgbouncer.ini"
@ -214,8 +221,3 @@ if(EXISTS ${UTF8CASE_LIB})
endif(EXISTS ${UTF8CASE_LIB})
add_subdirectory(concordia-server)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,22 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LemmaGenSockets", "LemmaGenSockets\LemmaGenSockets.csproj", "{3098BC55-2CC9-4612-9F79-8C812B3BE539}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{3098BC55-2CC9-4612-9F79-8C812B3BE539}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{3098BC55-2CC9-4612-9F79-8C812B3BE539}.Debug|Any CPU.Build.0 = Debug|Any CPU
{3098BC55-2CC9-4612-9F79-8C812B3BE539}.Release|Any CPU.ActiveCfg = Release|Any CPU
{3098BC55-2CC9-4612-9F79-8C812B3BE539}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2" />
</startup>
</configuration>

View File

@ -0,0 +1,73 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProjectGuid>{3098BC55-2CC9-4612-9F79-8C812B3BE539}</ProjectGuid>
<OutputType>Exe</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>LemmaGenSockets</RootNamespace>
<AssemblyName>LemmaGenSockets</AssemblyName>
<TargetFrameworkVersion>v4.5.2</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<PlatformTarget>AnyCPU</PlatformTarget>
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<PlatformTarget>AnyCPU</PlatformTarget>
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<ItemGroup>
<Reference Include="LemmaSharp">
<HintPath>..\LemmaGen\LemmaSharp.dll</HintPath>
</Reference>
<Reference Include="LemmaSharpPrebuilt">
<HintPath>..\LemmaGen\LemmaSharpPrebuilt.dll</HintPath>
</Reference>
<Reference Include="LemmaSharpPrebuiltCompact">
<HintPath>..\LemmaGen\LemmaSharpPrebuiltCompact.dll</HintPath>
</Reference>
<Reference Include="Lzma#">
<HintPath>..\LemmaGen\Lzma#.dll</HintPath>
</Reference>
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="System.Xml.Linq" />
<Reference Include="System.Data.DataSetExtensions" />
<Reference Include="Microsoft.CSharp" />
<Reference Include="System.Data" />
<Reference Include="System.Net.Http" />
<Reference Include="System.Xml" />
</ItemGroup>
<ItemGroup>
<Compile Include="LemmatizerListener.cs" />
<Compile Include="Program.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="App.config" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>

View File

@ -0,0 +1,130 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.Net.Sockets;
using System.Threading.Tasks;
using LemmaSharp;
namespace LemmaGenSockets
{
class LemmatizerListener
{
// Lemmatizers keyed by the language code sent by the client ("pl", "en").
private Dictionary<String, ILemmatizer> lemmatizersDict = new Dictionary<string, ILemmatizer>();

// Separator used to split compound words into their parts.
private char[] wordInnerSeparator = { '-' };

/// <summary>
/// Registers the prebuilt Polish and English lemmatizers in the dictionary.
/// </summary>
private void initializeLemmatizers()
{
    ILemmatizer polish = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Polish);
    lemmatizersDict.Add("pl", polish);

    ILemmatizer english = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);
    lemmatizersDict.Add("en", english);
}

/// <summary>
/// Creates the listener and loads all supported lemmatizers.
/// </summary>
public LemmatizerListener()
{
    this.initializeLemmatizers();
}
/// <summary>
/// Lemmatizes every whitespace-separated token of the sentence and returns
/// the lemmas joined by single spaces, with surrounding whitespace trimmed.
/// </summary>
/// <param name="languageCode">Key of the lemmatizer to use.</param>
/// <param name="sentence">Sentence to lemmatize.</param>
/// <returns>The lemmatized sentence.</returns>
private string lemmatizeSentence(string languageCode, string sentence)
{
    // A null separator array makes Split break on whitespace.
    IEnumerable<string> spacedLemmas = sentence
        .Split((char[])null)
        .Select(word => lemmatizeWord(languageCode, word) + " ");

    return string.Concat(spacedLemmas).Trim();
}
/// <summary>
/// Lemmatizes a single word. A word made of exactly two hyphen-separated
/// parts is lemmatized part by part; a first part ending in "o" is kept
/// as it is. Any other word is lemmatized as a whole.
/// </summary>
/// <param name="languageCode">Key of the lemmatizer to use.</param>
/// <param name="word">Word to lemmatize.</param>
/// <returns>The lemma, with the hyphen preserved for two-part words.</returns>
private string lemmatizeWord(string languageCode, string word)
{
    string[] pieces = word.Split(wordInnerSeparator);
    ILemmatizer lemmatizer = lemmatizersDict[languageCode];

    if (pieces.Length != 2)
    {
        return lemmatizer.Lemmatize(word);
    }

    string head = pieces[0].EndsWith("o") ? pieces[0] : lemmatizer.Lemmatize(pieces[0]);
    string tail = lemmatizer.Lemmatize(pieces[1]);
    return head + "-" + tail;
}
/// <summary>
/// Runs the lemmatizer server loop on 127.0.0.1:11000. Each accepted
/// connection carries one request of the form
/// "&lt;2-char language code&gt;&lt;sentence&gt;@#@" and is answered with
/// "&lt;lemmatized sentence&gt;@#@", after which the connection is closed.
/// A failure while serving one client is logged and does not stop the loop;
/// a failure of the listening socket itself is logged and ends the method.
/// </summary>
public void DoListening()
{
    // Establish the local endpoint for the socket.
    IPEndPoint localEndPoint = new IPEndPoint(IPAddress.Parse("127.0.0.1"), 11000);

    // Create a TCP/IP socket.
    Socket listener = new Socket(AddressFamily.InterNetwork,
                                 SocketType.Stream, ProtocolType.Tcp);
    try
    {
        // Bind the socket to the local endpoint and listen for incoming connections.
        listener.Bind(localEndPoint);
        listener.Listen(10);

        while (true)
        {
            // Program is suspended while waiting for an incoming connection.
            Socket handler = listener.Accept();
            try
            {
                serveClient(handler);
            }
            catch (Exception clientError)
            {
                // One bad request (unknown language code, broken connection)
                // must not take the whole service down.
                Console.WriteLine(clientError.ToString());
            }
            finally
            {
                handler.Close();
            }
        }
    }
    catch (Exception e)
    {
        Console.WriteLine(e.ToString());
    }
    finally
    {
        listener.Close();
    }
}

/// <summary>
/// Reads one request from the client, lemmatizes it and sends the reply.
/// Incomplete or malformed requests are logged and left unanswered.
/// </summary>
private void serveClient(Socket handler)
{
    string request = receiveRequest(handler);
    if (request == null)
    {
        Console.WriteLine("Client disconnected before sending a complete request.");
        return;
    }
    if (request.Length < 2)
    {
        Console.WriteLine("Malformed request: missing two-character language code.");
        return;
    }

    string languageCode = request.Substring(0, 2);
    string sentence = request.Substring(2);

    // Send lemmatized data back to client.
    byte[] msg = Encoding.UTF8.GetBytes(lemmatizeSentence(languageCode, sentence) + "@#@");
    handler.Send(msg);
    handler.Shutdown(SocketShutdown.Both);
}

/// <summary>
/// Receives data until the "@#@" terminator arrives and returns the text
/// before it, or null when the client closes the connection first.
/// </summary>
private string receiveRequest(Socket handler)
{
    const string terminator = "@#@";
    byte[] buffer = new byte[1024];
    // A stateful decoder keeps multi-byte UTF-8 sequences intact when they
    // are split across two Receive calls.
    Decoder decoder = Encoding.UTF8.GetDecoder();
    char[] chars = new char[Encoding.UTF8.GetMaxCharCount(buffer.Length)];
    StringBuilder received = new StringBuilder();

    while (true)
    {
        int bytesRec = handler.Receive(buffer);
        if (bytesRec == 0)
        {
            // Zero bytes means the peer closed the connection.
            return null;
        }

        int charCount = decoder.GetChars(buffer, 0, bytesRec, chars, 0);
        received.Append(chars, 0, charCount);

        int terminatorPos = received.ToString().IndexOf(terminator, StringComparison.Ordinal);
        if (terminatorPos > -1)
        {
            return received.ToString(0, terminatorPos);
        }
    }
}
}
}

View File

@ -0,0 +1,26 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.Net.Sockets;
using System.Threading.Tasks;
using LemmaSharp;
namespace LemmaGenSockets
{
/// <summary>
/// Entry point of the LemmaGenSockets service.
/// </summary>
class Program
{
    // Incoming data from the client.
    public static string data = null;

    /// <summary>
    /// Creates the lemmatizer listener and runs its accept loop.
    /// </summary>
    static void Main(string[] args)
    {
        new LemmatizerListener().DoListening();
    }
}
}

View File

@ -0,0 +1,36 @@
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
// General Information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
[assembly: AssemblyTitle("LemmaGenSockets")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("LemmaGenSockets")]
[assembly: AssemblyCopyright("Copyright © 2017")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]
// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]
// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("3098bc55-2cc9-4612-9f79-8c812b3be539")]
// Version information for an assembly consists of the following four values:
//
// Major Version
// Minor Version
// Build Number
// Revision
//
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]

View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2" />
</startup>
</configuration>

View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2" />
</startup>
</configuration>

View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0">
<assemblyIdentity version="1.0.0.0" name="MyApplication.app"/>
<trustInfo xmlns="urn:schemas-microsoft-com:asm.v2">
<security>
<requestedPrivileges xmlns="urn:schemas-microsoft-com:asm.v3">
<requestedExecutionLevel level="asInvoker" uiAccess="false"/>
</requestedPrivileges>
</security>
</trustInfo>
</assembly>

Binary file not shown.

View File

@ -0,0 +1,10 @@
j:\documents\visual studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe.config
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.pdb
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharp.dll
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuilt.dll
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuiltCompact.dll
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\Lzma#.dll
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb

View File

@ -1,19 +1,6 @@
# Collect every .cpp file matching the pattern into main_sources.
# NOTE(review): the glob is evaluated when CMake configures the project, so
# source files added later are only picked up after CMake is re-run.
file(GLOB main_sources "*.cpp")
# Build the concordia_server_process executable.
# NOTE(review): this listing contains both explicit file names and
# ${main_sources}; confirm that only one of the two is meant to remain.
add_executable(concordia_server_process
concordia_server_process.cpp
concordia_server.cpp
index_controller.cpp
searcher_controller.cpp
json_generator.cpp
unit_dao.cpp
db_connection.cpp
query_param.cpp
string_param.cpp
int_param.cpp
logger.cpp
int_array_param.cpp
simple_search_result.cpp
complete_concordia_search_result.cpp
tm_dao.cpp
${main_sources}
)
# Libraries the server process is linked against.
target_link_libraries(concordia_server_process fcgi fcgi++ pq concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case icuuc)

View File

@ -11,6 +11,7 @@
#include "json_generator.hpp"
#include "config.hpp"
#include "logger.hpp"
#include "socket_lemmatizer.hpp"
#include "rapidjson/rapidjson.h"
#include <boost/foreach.hpp>
#include <boost/ptr_container/ptr_map.hpp>
@ -93,6 +94,15 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
}
}
_indexController->addAlignedSentences(jsonWriter, sourceSentences, targetSentences, tmId);
} else if (operation == "lemmatize") {
std::string sentence = _getStringParameter(d, "sentence");
std::string languageCode = _getStringParameter(d, "languageCode");
SocketLemmatizer lemmatizer;
std::string lemmatizedSentence = lemmatizer.lemmatizeSentence(languageCode, sentence);
jsonWriter.StartObject();
jsonWriter.String("lemmatizedSentence");
jsonWriter.String(lemmatizedSentence.c_str());
jsonWriter.EndObject();
} else if (operation == REFRESH_INDEX_OP) {
int tmId = _getIntParameter(d, TM_ID_PARAM);
_indexController->refreshIndexFromRAM(jsonWriter, tmId);

View File

@ -1,5 +1,5 @@
#ifndef DB_MANAGER_HDR
#define DB_MANAGER_HDR
#ifndef DB_CONNECTION_HDR
#define DB_CONNECTION_HDR
#include <libpq-fe.h>
#include <string>

View File

@ -0,0 +1,90 @@
#include "socket_lemmatizer.hpp"

#include <unistd.h> //close
/**
    Creates the lemmatizer client and connects it to the lemmatizer
    service at 127.0.0.1:11000. Throws ConcordiaException when the
    connection cannot be established.
*/
SocketLemmatizer::SocketLemmatizer() throw(ConcordiaException) :
                                     _sock(-1) {
    _connect("127.0.0.1" , 11000);
}

/**
    Closes the socket, if one is open, so that the descriptor is released
    when the object goes away.
    NOTE(review): the object owns a raw descriptor, so it must not be copied;
    a copy would close the same descriptor twice.
*/
SocketLemmatizer::~SocketLemmatizer() {
    if (_sock != -1) {
        ::close(_sock);
        _sock = -1;
    }
}
/**
    Connect to a host on a certain port number.

    address is either a dotted IPv4 address or a host name to resolve;
    port is the TCP port in host byte order. Returns true on success and
    throws ConcordiaException when the socket cannot be created, the host
    cannot be resolved, or the connection fails. On failure the socket is
    closed and _sock is reset to -1, so nothing leaks when the exception
    leaves the constructor.
*/
bool SocketLemmatizer::_connect(std::string address , int port)
{
    //create socket if it is not already created
    if (_sock == -1) {
        _sock = socket(AF_INET , SOCK_STREAM , 0);
        if (_sock == -1) {
            throw ConcordiaException("Could not create socket for the lemmatizer.");
        }
    }

    //start from a zeroed address structure so no field is left uninitialized
    _server = sockaddr_in();
    _server.sin_family = AF_INET;
    _server.sin_port = htons(port);

    in_addr_t numericAddress = inet_addr(address.c_str());
    if (numericAddress == INADDR_NONE) {
        //resolve the hostname, its not an ip address
        struct hostent *he = gethostbyname(address.c_str());
        if (he == NULL || he->h_addr_list[0] == NULL) {
            ::close(_sock);
            _sock = -1;
            throw ConcordiaException("gethostbyname: Failed to resolve hostname");
        }
        //use the first resolved address
        _server.sin_addr = *reinterpret_cast<struct in_addr *>(he->h_addr_list[0]);
    } else { //plain ip address
        _server.sin_addr.s_addr = numericAddress;
    }

    //Connect to remote server
    if (connect(_sock , (struct sockaddr *) & _server , sizeof(_server)) < 0) {
        ::close(_sock);
        _sock = -1;
        throw ConcordiaException("connect failed. Error");
    }

    return true;
}
/**
    Send data to the connected host.

    Keeps calling send() until every byte of data has been written, because
    a single call may transmit only part of the buffer. Returns true on
    success and throws ConcordiaException when send() reports an error.
*/
bool SocketLemmatizer::_send_data(std::string data)
{
    const char * cursor = data.data();
    size_t remaining = data.size();

    while (remaining > 0) {
        ssize_t sent = send(_sock , cursor , remaining , 0);
        if (sent < 0) {
            throw ConcordiaException("Send failed");
        }
        cursor += sent;
        remaining -= static_cast<size_t>(sent);
    }

    return true;
}
/**
    Receive data from the connected host.

    Performs one recv() of at most size bytes and returns exactly the bytes
    that were received. An empty string means the peer closed the
    connection. Throws ConcordiaException when recv() reports an error.
*/
std::string SocketLemmatizer::_receive(int size=512)
{
    if (size <= 0) {
        size = 512;
    }

    std::string buffer(static_cast<size_t>(size), '\0');

    //Receive a reply from the server
    ssize_t received = recv(_sock , &buffer[0] , buffer.size() , 0);
    if (received < 0) {
        throw ConcordiaException("Receive failed");
    }

    //recv does not terminate the data, so keep only the bytes actually read
    buffer.resize(static_cast<size_t>(received));
    return buffer;
}

/**
    Sends languageCode + sentence + "@#@" to the lemmatizer and returns the
    reply up to, but not including, the "@#@" terminator.

    The reply is read in chunks until the terminator arrives, so replies
    longer than one chunk or split across several packets are handled.
    Throws ConcordiaException on a socket error or when the connection is
    closed before the terminator is received.
*/
std::string SocketLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
    const std::string terminator = "@#@";

    _send_data(languageCode + sentence + terminator);

    std::string reply;
    std::string::size_type terminatorPos = std::string::npos;
    while (terminatorPos == std::string::npos) {
        std::string chunk = _receive(512);
        if (chunk.empty()) {
            throw ConcordiaException("Lemmatizer closed the connection before sending a complete reply");
        }
        reply += chunk;
        terminatorPos = reply.find(terminator);
    }

    return reply.substr(0, terminatorPos);
}

View File

@ -0,0 +1,35 @@
#ifndef SOCKET_LEMMATIZER_HDR
#define SOCKET_LEMMATIZER_HDR
#include <string>
#include<sys/socket.h> //socket
#include<arpa/inet.h> //inet_addr
#include<netdb.h> //hostent
#include <concordia/concordia_exception.hpp>
/*! Client of the lemmatizer service, talking to it over a TCP socket.
    Requests and replies are plain text terminated by "@#@".
 */
class SocketLemmatizer {
public:
    /*! Constructor. Connects to the lemmatizer service.
      \throws ConcordiaException when the connection cannot be established
    */
    SocketLemmatizer() throw(ConcordiaException);

    /*! Destructor.
    */
    virtual ~SocketLemmatizer();

    /*! Sends a sentence to the lemmatizer service and returns its reply.
      \param languageCode code of the sentence language, sent as a prefix of the request
      \param sentence sentence to lemmatize
      \returns reply of the service without the "@#@" terminator
    */
    std::string lemmatizeSentence(std::string languageCode, std::string sentence);

private:
    /*! Connects the socket to the given address and port. */
    bool _connect(std::string address, int port);

    /*! Sends data to the connected host. */
    bool _send_data(std::string data);

    /*! Receives at most size bytes from the connected host. */
    std::string _receive(int size);

    /*! Socket descriptor, -1 when no socket has been created. */
    int _sock;

    /*! Address of the lemmatizer service. */
    struct sockaddr_in _server;
};
#endif

29
tests/lemmatizeSentence.py Executable file
View File

@ -0,0 +1,29 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest
import json
import urllib2
import sys
import time
import host
# Both the language code and the sentence are required; fail with a usage
# message instead of an IndexError when either is missing.
if len(sys.argv) < 3:
    sys.stderr.write('Usage: %s <languageCode> <sentence>\n' % sys.argv[0])
    sys.exit(1)

# JSON body of the "lemmatize" request.
data = {
    'operation': 'lemmatize',
    'languageCode': sys.argv[1],
    'sentence': sys.argv[2]
}

# Server address comes from the host module; the port is appended only when set.
address = 'http://' + host.concordia_host
if len(host.concordia_port) > 0:
    address += ':' + host.concordia_port

# Time the full round trip, including JSON decoding of the response.
start = time.time()
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
end = time.time()

print("Execution time: %.4f seconds." % (end - start))
print("Result: ")
print(response)

View File

@ -1,4 +1,4 @@
In order to configure Concordia as upstart job, copy the 2 .conf files into your /etc/init and run:
In order to configure Concordia as upstart job, copy the 3 .conf files into your /etc/init and run:
sudo initctl reload-configuration

View File

@ -0,0 +1,23 @@
# lemmagen
#
# This service runs the LemmaGen lemmatizer
# via mono. The process listens on port 11000.
description LemmaGen
# When to start the service
start on started concordia-server
# When to stop the service
stop on runlevel [016]
# Automatically restart process if crashed
respawn
# Essentially lets upstart know the process will detach itself to the background
expect fork
# Start the process
script
exec mono @LEMMAGEN_BINARIES_PATH@/LemmaGenSockets.exe &
end script