commit e14ff73fc6de6782e096294d57e340f542232a38 Author: rjawor Date: Wed May 15 21:03:15 2019 +0200 initial commit from concordia-server repository diff --git a/LemmaGen/LemmaSharp.dll b/LemmaGen/LemmaSharp.dll new file mode 100644 index 0000000..5d8380d Binary files /dev/null and b/LemmaGen/LemmaSharp.dll differ diff --git a/LemmaGen/LemmaSharpPrebuilt.dll b/LemmaGen/LemmaSharpPrebuilt.dll new file mode 100644 index 0000000..dde85f7 Binary files /dev/null and b/LemmaGen/LemmaSharpPrebuilt.dll differ diff --git a/LemmaGen/LemmaSharpPrebuiltCompact.dll b/LemmaGen/LemmaSharpPrebuiltCompact.dll new file mode 100644 index 0000000..c1b2a38 Binary files /dev/null and b/LemmaGen/LemmaSharpPrebuiltCompact.dll differ diff --git a/LemmaGen/Lzma#.dll b/LemmaGen/Lzma#.dll new file mode 100644 index 0000000..2bb9990 Binary files /dev/null and b/LemmaGen/Lzma#.dll differ diff --git a/LemmaGenSockets.sln b/LemmaGenSockets.sln new file mode 100644 index 0000000..970942e --- /dev/null +++ b/LemmaGenSockets.sln @@ -0,0 +1,22 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 14 +VisualStudioVersion = 14.0.25420.1 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LemmaGenSockets", "LemmaGenSockets\LemmaGenSockets.csproj", "{3098BC55-2CC9-4612-9F79-8C812B3BE539}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {3098BC55-2CC9-4612-9F79-8C812B3BE539}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {3098BC55-2CC9-4612-9F79-8C812B3BE539}.Debug|Any CPU.Build.0 = Debug|Any CPU + {3098BC55-2CC9-4612-9F79-8C812B3BE539}.Release|Any CPU.ActiveCfg = Release|Any CPU + {3098BC55-2CC9-4612-9F79-8C812B3BE539}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + 
EndGlobalSection +EndGlobal diff --git a/LemmaGenSockets/App.config b/LemmaGenSockets/App.config new file mode 100644 index 0000000..d740e88 --- /dev/null +++ b/LemmaGenSockets/App.config @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/LemmaGenSockets/LemmaGenSockets.csproj b/LemmaGenSockets/LemmaGenSockets.csproj new file mode 100644 index 0000000..eaae3e2 --- /dev/null +++ b/LemmaGenSockets/LemmaGenSockets.csproj @@ -0,0 +1,73 @@ + + + + + Debug + AnyCPU + {3098BC55-2CC9-4612-9F79-8C812B3BE539} + Exe + Properties + LemmaGenSockets + LemmaGenSockets + v4.5.2 + 512 + true + + + AnyCPU + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + + + AnyCPU + pdbonly + true + bin\Release\ + TRACE + prompt + 4 + + + + ..\LemmaGen\LemmaSharp.dll + + + ..\LemmaGen\LemmaSharpPrebuilt.dll + + + ..\LemmaGen\LemmaSharpPrebuiltCompact.dll + + + ..\LemmaGen\Lzma#.dll + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file
diff --git a/LemmaGenSockets/LemmatizerListener.cs b/LemmaGenSockets/LemmatizerListener.cs
new file mode 100644
index 0000000..28afaf9
--- /dev/null
+++ b/LemmaGenSockets/LemmatizerListener.cs
@@ -0,0 +1,249 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Net;
+using System.Net.Sockets;
+using System.Threading.Tasks;
+using LemmaSharp;
+
+namespace LemmaGenSockets
+{
+    /// <summary>
+    /// Blocking, single-threaded TCP service that lemmatizes sentences.
+    /// A request is a two-character language code immediately followed by the
+    /// sentence and the terminator "@#@"; the reply is the lemmatized sentence
+    /// followed by the same terminator.
+    /// </summary>
+    class LemmatizerListener
+    {
+        // Marks the end of every request and of every reply.
+        private const string MessageTerminator = "@#@";
+
+        // Number of leading request characters that hold the language code.
+        private const int LanguageCodeLength = 2;
+
+        private const string ListenAddress = "127.0.0.1";
+        private const int ListenPort = 11000;
+        private const int PendingConnectionsBacklog = 10;
+        private const int ReceiveBufferSize = 1024;
+
+        // Words returned untouched, keyed by language code. Built once
+        // instead of being re-created for every single word.
+        private static readonly Dictionary<string, HashSet<string>> wordExceptions =
+            new Dictionary<string, HashSet<string>>
+            {
+                { "pl", new HashSet<string> { "i", "o", "do" } },
+                { "en", new HashSet<string> { "d" } }
+            };
+
+        private readonly Dictionary<string, LemmatizerPrebuiltCompact> lemmatizersDict =
+            new Dictionary<string, LemmatizerPrebuiltCompact>();
+
+        private readonly char[] wordInnerSeparator = { '-' };
+
+        /// <summary>
+        /// Creates the listener and loads one prebuilt lemmatizer per supported language code.
+        /// </summary>
+        public LemmatizerListener()
+        {
+            initializeLemmatizers();
+        }
+
+        private void initializeLemmatizers()
+        {
+            lemmatizersDict.Add("pl", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Polish));
+            lemmatizersDict.Add("en", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English));
+            // NOTE(review): "hr" is served by the Serbian model - presumably on purpose; confirm.
+            lemmatizersDict.Add("hr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Serbian));
+            lemmatizersDict.Add("fr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.French));
+        }
+
+        /// <summary>
+        /// Lemmatizes every whitespace-separated token of <paramref name="sentence"/>.
+        /// </summary>
+        /// <param name="languageCode">Key of the lemmatizer to use, e.g. "pl".</param>
+        /// <param name="sentence">Text to lemmatize.</param>
+        /// <returns>
+        /// The lemmas joined by single spaces and trimmed, or the sentence unchanged
+        /// when no lemmatizer is registered for <paramref name="languageCode"/>.
+        /// </returns>
+        private string lemmatizeSentence(string languageCode, string sentence)
+        {
+            LemmatizerPrebuiltCompact lemmatizer;
+            if (!lemmatizersDict.TryGetValue(languageCode, out lemmatizer))
+            {
+                // if we cannot lemmatize, let's not do it at all
+                // primum non nocere
+                return sentence;
+            }
+
+            HashSet<string> languageExceptions;
+            wordExceptions.TryGetValue(languageCode, out languageExceptions);
+
+            // A null separator array splits on any white-space character.
+            string[] tokens = sentence.Split((char[])null);
+            string[] lemmas = new string[tokens.Length];
+            for (int i = 0; i < tokens.Length; i++)
+            {
+                lemmas[i] = lemmatizeWord(lemmatizer, languageExceptions, tokens[i]);
+            }
+
+            return string.Join(" ", lemmas).Trim();
+        }
+
+        /// <summary>
+        /// Lemmatizes a single token. Tokens starting with "ne_" and the per-language
+        /// exceptions are returned as they are. A token made of exactly two
+        /// hyphen-separated parts is lemmatized part by part; a first part ending
+        /// in "o" is kept verbatim.
+        /// </summary>
+        /// <param name="lemmatizer">Lemmatizer of the request language.</param>
+        /// <param name="languageExceptions">Words to leave untouched; may be null.</param>
+        /// <param name="word">Token to lemmatize.</param>
+        /// <returns>The lemma, or the token itself when no lemma was produced.</returns>
+        private string lemmatizeWord(LemmatizerPrebuiltCompact lemmatizer, HashSet<string> languageExceptions, string word)
+        {
+            // Consecutive separators yield empty tokens; there is nothing to lemmatize in them.
+            if (word.Length == 0 || word.StartsWith("ne_", StringComparison.Ordinal))
+            {
+                return word;
+            }
+
+            if (languageExceptions != null && languageExceptions.Contains(word))
+            {
+                return word;
+            }
+
+            string result;
+            string[] parts = word.Split(wordInnerSeparator);
+            if (parts.Length == 2)
+            {
+                string firstPart = parts[0];
+                if (!firstPart.EndsWith("o", StringComparison.Ordinal))
+                {
+                    firstPart = lemmatizer.Lemmatize(firstPart);
+                }
+                string secondPart = lemmatizer.Lemmatize(parts[1]);
+                result = firstPart + "-" + secondPart;
+            }
+            else
+            {
+                result = lemmatizer.Lemmatize(word);
+            }
+
+            return string.IsNullOrEmpty(result) ? word : result;
+        }
+
+        /// <summary>
+        /// Binds to 127.0.0.1:11000 and serves clients one at a time, forever.
+        /// A failure while serving one client is logged and does not stop the
+        /// service; a failure to bind or accept is logged and ends the method.
+        /// </summary>
+        public void DoListening()
+        {
+            IPEndPoint localEndPoint = new IPEndPoint(IPAddress.Parse(ListenAddress), ListenPort);
+
+            try
+            {
+                using (Socket listener = new Socket(AddressFamily.InterNetwork,
+                    SocketType.Stream, ProtocolType.Tcp))
+                {
+                    listener.Bind(localEndPoint);
+                    listener.Listen(PendingConnectionsBacklog);
+
+                    while (true)
+                    {
+                        // Program is suspended while waiting for an incoming connection.
+                        using (Socket handler = listener.Accept())
+                        {
+                            try
+                            {
+                                handleClient(handler);
+                            }
+                            catch (Exception e)
+                            {
+                                // One broken request must not take the whole service down.
+                                Console.WriteLine(e.ToString());
+                            }
+                        }
+                    }
+                }
+            }
+            catch (Exception e)
+            {
+                Console.WriteLine(e.ToString());
+            }
+        }
+
+        /// <summary>
+        /// Reads one request from <paramref name="handler"/>, sends the lemmatized
+        /// reply and shuts the connection down. The caller disposes the socket.
+        /// </summary>
+        private void handleClient(Socket handler)
+        {
+            string request = receiveRequest(handler);
+            if (request == null)
+            {
+                // The peer went away before completing its request; nobody to answer.
+                return;
+            }
+
+            string reply;
+            if (request.Length < LanguageCodeLength)
+            {
+                // No room for a language code: answer with an empty sentence
+                // so the client still receives its terminator.
+                Console.WriteLine("Malformed request (missing language code): " + request);
+                reply = "";
+            }
+            else
+            {
+                string languageCode = request.Substring(0, LanguageCodeLength);
+                string sentence = request.Substring(LanguageCodeLength);
+                reply = lemmatizeSentence(languageCode, sentence);
+            }
+
+            // Send lemmatized data back to client.
+            handler.Send(Encoding.UTF8.GetBytes(reply + MessageTerminator));
+            handler.Shutdown(SocketShutdown.Both);
+        }
+
+        /// <summary>
+        /// Receives data until the message terminator shows up.
+        /// </summary>
+        /// <returns>
+        /// The text preceding the first terminator, or null when the peer closed
+        /// the connection before sending one.
+        /// </returns>
+        private static string receiveRequest(Socket handler)
+        {
+            byte[] buffer = new byte[ReceiveBufferSize];
+            // A stateful decoder keeps the bytes of a UTF-8 character that is
+            // split across two Receive calls instead of corrupting it.
+            Decoder decoder = Encoding.UTF8.GetDecoder();
+            char[] chars = new char[Encoding.UTF8.GetMaxCharCount(buffer.Length)];
+            StringBuilder received = new StringBuilder();
+
+            while (true)
+            {
+                int bytesRec = handler.Receive(buffer);
+                if (bytesRec == 0)
+                {
+                    // Receive returns 0 once the peer has shut the connection down.
+                    return null;
+                }
+
+                int charCount = decoder.GetChars(buffer, 0, bytesRec, chars, 0);
+                received.Append(chars, 0, charCount);
+
+                string data = received.ToString();
+                int terminatorIndex = data.IndexOf(MessageTerminator, StringComparison.Ordinal);
+                if (terminatorIndex > -1)
+                {
+                    return data.Substring(0, terminatorIndex);
+                }
+            }
+        }
+    }
+}
diff --git a/LemmaGenSockets/Program.cs b/LemmaGenSockets/Program.cs new file mode 100644 index 0000000..b7ce8a5 --- /dev/null +++ b/LemmaGenSockets/Program.cs @@ -0,0 +1,26 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Net; +using System.Net.Sockets; +using System.Threading.Tasks; +using LemmaSharp; + +namespace LemmaGenSockets +{ + class Program + { + + // Incoming data from the client. 
+ public static string data = null; + + + + static void Main(string[] args) + { + LemmatizerListener listener = new LemmatizerListener(); + listener.DoListening(); + } + } +} diff --git a/LemmaGenSockets/Properties/AssemblyInfo.cs b/LemmaGenSockets/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..a865875 --- /dev/null +++ b/LemmaGenSockets/Properties/AssemblyInfo.cs @@ -0,0 +1,36 @@ +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("LemmaGenSockets")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("")] +[assembly: AssemblyProduct("LemmaGenSockets")] +[assembly: AssemblyCopyright("Copyright © 2017")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. 
+[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("3098bc55-2cc9-4612-9f79-8c812b3be539")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Build and Revision Numbers +// by using the '*' as shown below: +// [assembly: AssemblyVersion("1.0.*")] +[assembly: AssemblyVersion("1.0.0.0")] +[assembly: AssemblyFileVersion("1.0.0.0")] diff --git a/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe b/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe new file mode 100644 index 0000000..dfb6538 Binary files /dev/null and b/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe differ diff --git a/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe.config b/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe.config new file mode 100644 index 0000000..d740e88 --- /dev/null +++ b/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe.config @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb b/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb new file mode 100644 index 0000000..e85154f Binary files /dev/null and b/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb differ diff --git a/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe b/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe new file mode 100644 index 0000000..681ab77 Binary files /dev/null and b/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe differ diff --git a/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe.config b/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe.config new file mode 100644 index 0000000..d740e88 --- /dev/null +++ b/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe.config @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe.manifest 
b/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe.manifest new file mode 100644 index 0000000..f96b1d6 --- /dev/null +++ b/LemmaGenSockets/bin/Debug/LemmaGenSockets.vshost.exe.manifest @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/LemmaGenSockets/bin/Debug/LemmaSharp.dll b/LemmaGenSockets/bin/Debug/LemmaSharp.dll new file mode 100644 index 0000000..5d8380d Binary files /dev/null and b/LemmaGenSockets/bin/Debug/LemmaSharp.dll differ diff --git a/LemmaGenSockets/bin/Debug/LemmaSharpPrebuilt.dll b/LemmaGenSockets/bin/Debug/LemmaSharpPrebuilt.dll new file mode 100644 index 0000000..dde85f7 Binary files /dev/null and b/LemmaGenSockets/bin/Debug/LemmaSharpPrebuilt.dll differ diff --git a/LemmaGenSockets/bin/Debug/LemmaSharpPrebuiltCompact.dll b/LemmaGenSockets/bin/Debug/LemmaSharpPrebuiltCompact.dll new file mode 100644 index 0000000..c1b2a38 Binary files /dev/null and b/LemmaGenSockets/bin/Debug/LemmaSharpPrebuiltCompact.dll differ diff --git a/LemmaGenSockets/bin/Debug/Lzma#.dll b/LemmaGenSockets/bin/Debug/Lzma#.dll new file mode 100644 index 0000000..2bb9990 Binary files /dev/null and b/LemmaGenSockets/bin/Debug/Lzma#.dll differ diff --git a/LemmaGenSockets/obj/Debug/DesignTimeResolveAssemblyReferencesInput.cache b/LemmaGenSockets/obj/Debug/DesignTimeResolveAssemblyReferencesInput.cache new file mode 100644 index 0000000..41637e9 Binary files /dev/null and b/LemmaGenSockets/obj/Debug/DesignTimeResolveAssemblyReferencesInput.cache differ diff --git a/LemmaGenSockets/obj/Debug/LemmaGenSockets.csproj.FileListAbsolute.txt b/LemmaGenSockets/obj/Debug/LemmaGenSockets.csproj.FileListAbsolute.txt new file mode 100644 index 0000000..dc882e2 --- /dev/null +++ b/LemmaGenSockets/obj/Debug/LemmaGenSockets.csproj.FileListAbsolute.txt @@ -0,0 +1,21 @@ +j:\documents\visual studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe.config +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe 
+j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.pdb +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharp.dll +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuilt.dll +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuiltCompact.dll +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Debug\Lzma#.dll +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe +j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe.config +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.pdb +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharp.dll +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuilt.dll +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuiltCompact.dll +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\Lzma#.dll +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csproj.CoreCompileInputs.cache +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe +J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb diff --git 
a/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache b/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache new file mode 100644 index 0000000..58d7cf5 Binary files /dev/null and b/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache differ diff --git a/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe b/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe new file mode 100644 index 0000000..dfb6538 Binary files /dev/null and b/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe differ diff --git a/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb b/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb new file mode 100644 index 0000000..e85154f Binary files /dev/null and b/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb differ diff --git a/LemmaGenSockets/obj/Debug/TemporaryGeneratedFile_036C0B5B-1481-4323-8D20-8F5ADCB23D92.cs b/LemmaGenSockets/obj/Debug/TemporaryGeneratedFile_036C0B5B-1481-4323-8D20-8F5ADCB23D92.cs new file mode 100644 index 0000000..e69de29 diff --git a/LemmaGenSockets/obj/Debug/TemporaryGeneratedFile_5937a670-0e60-4077-877b-f7221da3dda1.cs b/LemmaGenSockets/obj/Debug/TemporaryGeneratedFile_5937a670-0e60-4077-877b-f7221da3dda1.cs new file mode 100644 index 0000000..e69de29 diff --git a/LemmaGenSockets/obj/Debug/TemporaryGeneratedFile_E7A71F73-0F8D-4B9B-B56E-8E70B10BC5D3.cs b/LemmaGenSockets/obj/Debug/TemporaryGeneratedFile_E7A71F73-0F8D-4B9B-B56E-8E70B10BC5D3.cs new file mode 100644 index 0000000..e69de29