diff --git a/CHANGELOG.md b/CHANGELOG.md index cd64a5d..9b27539 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ + +## 1.18.2.0 + +* During validation, check the number of columns +* During validation, check the number of lines +* Validate train files + +## 1.18.1.0 + +* During validation, check whether the maximum value is obtained with the expected data + ## 1.18.0.0 * Add --validate option diff --git a/geval.cabal b/geval.cabal index b4d7842..ba5d58b 100644 --- a/geval.cabal +++ b/geval.cabal @@ -1,5 +1,5 @@ name: geval -version: 1.18.1.0 +version: 1.18.2.0 synopsis: Machine learning evaluation tools description: Please see README.md homepage: http://github.com/name/project diff --git a/src/GEval/Metric.hs b/src/GEval/Metric.hs index 6776eef..d2e1e40 100644 --- a/src/GEval/Metric.hs +++ b/src/GEval/Metric.hs @@ -7,7 +7,9 @@ module GEval.Metric getMetricOrdering, listOfAvailableMetrics, bestPossibleValue, - perfectOutLineFromExpectedLine) + perfectOutLineFromExpectedLine, + fixedNumberOfColumnsInExpected, + fixedNumberOfColumnsInInput) where import Data.Word @@ -195,6 +197,17 @@ bestPossibleValue metric = case getMetricOrdering metric of TheLowerTheBetter -> 0.0 TheHigherTheBetter -> 1.0 +fixedNumberOfColumnsInExpected :: Metric -> Bool +fixedNumberOfColumnsInExpected MAP = False +fixedNumberOfColumnsInExpected BLEU = False +fixedNumberOfColumnsInExpected GLEU = False +fixedNumberOfColumnsInExpected _ = True + +fixedNumberOfColumnsInInput :: Metric -> Bool +fixedNumberOfColumnsInInput (SoftFMeasure _) = False +fixedNumberOfColumnsInInput (ProbabilisticSoftFMeasure _) = False +fixedNumberOfColumnsInInput _ = True + perfectOutLineFromExpectedLine :: Metric -> Text -> Text perfectOutLineFromExpectedLine (LogLossHashed _) t = t <> ":1.0" perfectOutLineFromExpectedLine (LikelihoodHashed _) t = t <> ":1.0" diff --git a/src/GEval/Validation.hs b/src/GEval/Validation.hs index 00ec475..2cf925e 100644 --- a/src/GEval/Validation.hs +++ 
b/src/GEval/Validation.hs @@ -23,7 +23,7 @@ import qualified Data.Conduit.Text as CT import Data.Conduit.Binary (sourceFile, sinkFile) import Data.Conduit.AutoDecompress (autoDecompress) import Data.Conduit.SmartSource (compressedFilesHandled) -import Data.List (intercalate) +import Data.List (intercalate, nub) import qualified Data.Text as T import System.IO.Temp @@ -34,12 +34,15 @@ data ValidationException = NoChallengeDirectory FilePath | NoReadmeFile FilePath | NoGitignoreFile FilePath | EmptyFile FilePath + | VaryingNumberOfLines | NoTestDirectories | TooManyInputFiles [FilePath] | TooManyExpectedFiles [FilePath] + | TooManyTrainFiles [FilePath] | OutputFileDetected [FilePath] | CharacterCRDetected FilePath | SpaceSuffixDetect FilePath + | VaryingNumberOfColumns FilePath | BestPossibleValueNotObtainedWithExpectedData MetricValue MetricValue instance Exception ValidationException @@ -51,12 +54,15 @@ instance Show ValidationException where show (NoReadmeFile filePath) = somethingWrongWithFilesMessage "No README.md file" filePath show (NoGitignoreFile filePath) = somethingWrongWithFilesMessage "No .gitignore file" filePath show (EmptyFile filePath) = somethingWrongWithFilesMessage "Empty file" filePath + show VaryingNumberOfLines = "The number of lines in input and expected file is not the same" show NoTestDirectories = "No directories with test data, expected `dev-0` and/or `test-A` directory" show (TooManyInputFiles filePaths) = somethingWrongWithFilesMessage "Too many input files" $ intercalate "`, `" filePaths show (TooManyExpectedFiles filePaths) = somethingWrongWithFilesMessage "Too many expected files" $ intercalate "`, `" filePaths + show (TooManyTrainFiles filePaths) = somethingWrongWithFilesMessage "Too many train files" $ intercalate "`, `" filePaths show (OutputFileDetected filePaths) = somethingWrongWithFilesMessage "Output file/s detected" $ intercalate "`, `" filePaths show (CharacterCRDetected filePaths) = somethingWrongWithFilesMessage "Found CR 
(Carriage Return, 0x0D) character" filePaths show (SpaceSuffixDetect filePaths) = somethingWrongWithFilesMessage "Found space at the end of line" filePaths + show (VaryingNumberOfColumns filePaths) = somethingWrongWithFilesMessage "The file contains varying number of columns" filePaths show (BestPossibleValueNotObtainedWithExpectedData expected got) = "The best possible value was not obtained with the expected data, expected: " ++ (show expected) ++ " , obtained: " ++ (show got) validationChallenge :: FilePath -> GEvalSpecification -> IO () @@ -69,7 +75,8 @@ validationChallenge challengeDirectory spec = do checkCorrectFile gitignoreFile checkCorrectFile readmeFile testDirectories <- findTestDirs challengeDirectory - checkTestDirectories testDirectories + checkTestDirectories mainMetric testDirectories + checkTrainDirectory mainMetric challengeDirectory mapM_ (runOnTest spec) testDirectories @@ -77,7 +84,7 @@ validationChallenge challengeDirectory spec = do configFile = challengeDirectory "config.txt" gitignoreFile = challengeDirectory ".gitignore" readmeFile = challengeDirectory "README.md" - + mainMetric = head $ gesMetrics spec checkCorrectFile :: FilePath -> IO () checkCorrectFile filePath = do @@ -95,6 +102,19 @@ getFileLines file = runResourceT $ runConduit (sourceFile file .| CC.map T.unpack .| CL.consume) +countLines :: FilePath -> IO Int +countLines file = do + lines <- getFileLines file + return $ length lines + +numberOfColumns :: FilePath -> IO [Int] +numberOfColumns file = runResourceT $ runConduit (sourceFile file + .| autoDecompress + .| CC.decodeUtf8 + .| CT.lines + .| CC.map (\t -> length $ T.splitOn "\t" t) + .| CL.consume) + createPerfectOutputFromExpected :: Metric -> FilePath -> FilePath -> IO () createPerfectOutputFromExpected metric expectedFile outFile = do runResourceT $ runConduit $ (sourceFile expectedFile @@ -116,6 +136,9 @@ findInputFiles = SFF.find never $ fileFilter defaultInputFile findOutputFiles :: FilePath -> IO [FilePath] 
findOutputFiles = SFF.find never $ fileFilter "out*.tsv" +findTrainFiles :: FilePath -> IO [FilePath] +findTrainFiles = SFF.find never $ fileFilter "train.tsv" + findExpectedFiles :: FilePath -> IO [FilePath] findExpectedFiles = SFF.find never $ fileFilter defaultExpectedFile @@ -131,21 +154,28 @@ fileFilter fileName = (SFF.fileType ==? RegularFile) &&? (SFF.fileName ~~? fileN exts = Prelude.concat [ "(", intercalate "|" compressedFilesHandled, ")" ] -checkTestDirectories :: [FilePath] -> IO () -checkTestDirectories [] = throwM NoTestDirectories -checkTestDirectories directories = mapM_ checkTestDirectory directories +checkTestDirectories :: Metric -> [FilePath] -> IO () +checkTestDirectories _ [] = throwM NoTestDirectories +checkTestDirectories metric directories = mapM_ (checkTestDirectory metric) directories -checkTestDirectory :: FilePath -> IO () -checkTestDirectory directoryPath = do +checkTestDirectory :: Metric -> FilePath -> IO () +checkTestDirectory metric directoryPath = do inputFiles <- findInputFiles directoryPath when (null inputFiles) $ throw $ NoInputFile inputFile when (length inputFiles > 1) $ throw $ TooManyInputFiles inputFiles checkCorrectFile $ head inputFiles + when (fixedNumberOfColumnsInInput metric) $ checkColumns $ head inputFiles expectedFiles <- findExpectedFiles directoryPath when (null expectedFiles) $ throw $ NoExpectedFile expectedFile when (length expectedFiles > 1) $ throw $ TooManyExpectedFiles expectedFiles checkCorrectFile $ head expectedFiles + when (fixedNumberOfColumnsInExpected metric) $ checkColumns $ head expectedFiles + + inputLines <- countLines $ head inputFiles + expectedLines <- countLines $ head expectedFiles + + when (inputLines /= expectedLines) $ throw $ VaryingNumberOfLines outputFiles <- findOutputFiles directoryPath unless (null outputFiles) $ throw $ OutputFileDetected outputFiles @@ -153,6 +183,24 @@ checkTestDirectory directoryPath = do inputFile = directoryPath defaultInputFile expectedFile = directoryPath 
defaultExpectedFile +checkTrainDirectory :: Metric -> FilePath -> IO () +checkTrainDirectory metric challengeDirectory = do + let trainDirectory = challengeDirectory "train" + whenM (doesDirectoryExist trainDirectory) $ do + trainFiles <- findTrainFiles trainDirectory + when (null trainFiles) $ throw $ NoInputFile "train.tsv" + when (length trainFiles > 1) $ throw $ TooManyTrainFiles trainFiles + let [trainFile] = trainFiles + checkCorrectFile trainFile + when (fixedNumberOfColumnsInInput metric && fixedNumberOfColumnsInExpected metric) $ do + checkColumns trainFile + +checkColumns :: FilePath -> IO () +checkColumns filePath = do + columns <- numberOfColumns filePath + let uniqueColumns = nub columns + when (length uniqueColumns > 1) $ throw $ VaryingNumberOfColumns filePath + runOnTest :: GEvalSpecification -> FilePath -> IO () runOnTest spec testPath = do [expectedFile] <- findExpectedFiles testPath