concordia-server/mgiza-aligner/LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/SentenceLemmatizer.cs
2017-04-26 14:03:29 +02:00

86 lines
2.3 KiB
C#

using LemmaSharp;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace LemmaGenSentenceLemmatizer
{
/// <summary>
/// Lemmatizes whitespace-separated sentences token by token using a prebuilt
/// LemmaSharp model selected by language code.
/// </summary>
class SentenceLemmatizer
{
    /// <summary>Separator that splits a compound token into independently lemmatized parts.</summary>
    private static readonly char[] wordInnerSeparator = { '-' };

    /// <summary>Lemmatizer for the language chosen in the constructor.</summary>
    private readonly ILemmatizer lemmatizer;

    /// <summary>
    /// Creates a lemmatizer for the given language.
    /// </summary>
    /// <param name="languageCode">One of "pl", "en" or "hr".</param>
    /// <exception cref="ArgumentException">
    /// Thrown when <paramref name="languageCode"/> is not one of the supported codes.
    /// </exception>
    public SentenceLemmatizer(string languageCode)
    {
        switch (languageCode)
        {
            case "pl":
                lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Polish);
                break;
            case "en":
                lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.English);
                break;
            case "hr":
                // NOTE(review): "hr" is served by the Serbian model; presumably no
                // Croatian prebuilt model is available - confirm this is intentional.
                lemmatizer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Serbian);
                break;
            default:
                throw new ArgumentException("Unknown language code: " + languageCode);
        }
    }

    /// <summary>
    /// Lemmatizes every whitespace-separated token of a sentence.
    /// </summary>
    /// <param name="sentence">Sentence to process; tokens are split on any whitespace character.</param>
    /// <returns>
    /// The lemmatized tokens joined with single spaces, with leading and trailing
    /// whitespace removed. Empty tokens produced by consecutive whitespace are kept,
    /// so the number of separators inside the sentence is preserved.
    /// </returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="sentence"/> is null.</exception>
    public string lemmatizeSentence(string sentence)
    {
        if (sentence == null)
        {
            throw new ArgumentNullException("sentence");
        }

        // A null separator array makes Split break on every whitespace character.
        string[] tokens = sentence.Split((char[])null);
        string[] lemmas = Array.ConvertAll(tokens, lemmatizeWord);

        // Join once instead of concatenating in a loop (avoids quadratic copying).
        return string.Join(" ", lemmas).Trim();
    }

    /// <summary>
    /// Lemmatizes a single token, falling back to the token itself when the
    /// lemmatizer yields an empty result or one that contains a space.
    /// </summary>
    /// <param name="word">Token without whitespace.</param>
    /// <returns>The lemma, or <paramref name="word"/> unchanged when it must not be lemmatized.</returns>
    private string lemmatizeWord(string word)
    {
        // Tokens prefixed with "ne_" and the short words "i", "o", "do" are passed
        // through untouched. NOTE(review): presumably "ne_" marks placeholder tokens
        // and the short words are function words the model handles badly - confirm.
        // Ordinal comparison keeps the check independent of the current culture.
        if (word.StartsWith("ne_", StringComparison.Ordinal) || word == "i" || word == "o" || word == "do")
        {
            return word;
        }

        string result;
        string[] parts = word.Split(wordInnerSeparator);
        if (parts.Length == 2)
        {
            // Exactly one hyphen: lemmatize both halves separately. A first half
            // ending in "o" is kept verbatim (NOTE(review): presumably a combining
            // form such as the first element of a compound adjective - confirm).
            string firstPart = parts[0];
            if (!firstPart.EndsWith("o", StringComparison.Ordinal))
            {
                firstPart = lemmatizer.Lemmatize(firstPart);
            }
            string secondPart = lemmatizer.Lemmatize(parts[1]);
            result = firstPart + "-" + secondPart;
        }
        else
        {
            result = lemmatizer.Lemmatize(word);
        }

        // An empty lemma or one containing a space would change the token count
        // of the sentence, so the original token is returned instead.
        if (result == "" || result.Contains(" "))
        {
            return word;
        }

        return result;
    }
}
}