More checks in validation

This commit is contained in:
Filip Gralinski 2019-08-10 16:31:54 +02:00
parent 9b79b8761d
commit 4452095538
4 changed files with 82 additions and 10 deletions

View File

@ -1,3 +1,14 @@
## 1.18.2.0
* During validation, check the number of columns
* During validation, check the number of lines
* Validate train files
## 1.18.1.0
* During validation, check whether the maximum value is obtained with the expected data
## 1.18.0.0 ## 1.18.0.0
* Add --validate option * Add --validate option

View File

@ -1,5 +1,5 @@
name: geval name: geval
version: 1.18.1.0 version: 1.18.2.0
synopsis: Machine learning evaluation tools synopsis: Machine learning evaluation tools
description: Please see README.md description: Please see README.md
homepage: http://github.com/name/project homepage: http://github.com/name/project

View File

@ -7,7 +7,9 @@ module GEval.Metric
getMetricOrdering, getMetricOrdering,
listOfAvailableMetrics, listOfAvailableMetrics,
bestPossibleValue, bestPossibleValue,
perfectOutLineFromExpectedLine) perfectOutLineFromExpectedLine,
fixedNumberOfColumnsInExpected,
fixedNumberOfColumnsInInput)
where where
import Data.Word import Data.Word
@ -195,6 +197,17 @@ bestPossibleValue metric = case getMetricOrdering metric of
TheLowerTheBetter -> 0.0 TheLowerTheBetter -> 0.0
TheHigherTheBetter -> 1.0 TheHigherTheBetter -> 1.0
-- | Tell whether expected data for a metric has a fixed number of columns.
--
-- 'False' for 'MAP', 'BLEU' and 'GLEU'; 'True' for every other metric.
fixedNumberOfColumnsInExpected :: Metric -> Bool
fixedNumberOfColumnsInExpected metric = case metric of
  MAP  -> False
  BLEU -> False
  GLEU -> False
  _    -> True
-- | Tell whether input data for a metric has a fixed number of columns.
--
-- 'False' for 'SoftFMeasure' and 'ProbabilisticSoftFMeasure' (whatever their
-- parameter); 'True' for every other metric.
fixedNumberOfColumnsInInput :: Metric -> Bool
fixedNumberOfColumnsInInput metric = case metric of
  SoftFMeasure _              -> False
  ProbabilisticSoftFMeasure _ -> False
  _                           -> True
perfectOutLineFromExpectedLine :: Metric -> Text -> Text perfectOutLineFromExpectedLine :: Metric -> Text -> Text
perfectOutLineFromExpectedLine (LogLossHashed _) t = t <> ":1.0" perfectOutLineFromExpectedLine (LogLossHashed _) t = t <> ":1.0"
perfectOutLineFromExpectedLine (LikelihoodHashed _) t = t <> ":1.0" perfectOutLineFromExpectedLine (LikelihoodHashed _) t = t <> ":1.0"

View File

@ -23,7 +23,7 @@ import qualified Data.Conduit.Text as CT
import Data.Conduit.Binary (sourceFile, sinkFile) import Data.Conduit.Binary (sourceFile, sinkFile)
import Data.Conduit.AutoDecompress (autoDecompress) import Data.Conduit.AutoDecompress (autoDecompress)
import Data.Conduit.SmartSource (compressedFilesHandled) import Data.Conduit.SmartSource (compressedFilesHandled)
import Data.List (intercalate) import Data.List (intercalate, nub)
import qualified Data.Text as T import qualified Data.Text as T
import System.IO.Temp import System.IO.Temp
@ -34,12 +34,15 @@ data ValidationException = NoChallengeDirectory FilePath
| NoReadmeFile FilePath | NoReadmeFile FilePath
| NoGitignoreFile FilePath | NoGitignoreFile FilePath
| EmptyFile FilePath | EmptyFile FilePath
| VaryingNumberOfLines
| NoTestDirectories | NoTestDirectories
| TooManyInputFiles [FilePath] | TooManyInputFiles [FilePath]
| TooManyExpectedFiles [FilePath] | TooManyExpectedFiles [FilePath]
| TooManyTrainFiles [FilePath]
| OutputFileDetected [FilePath] | OutputFileDetected [FilePath]
| CharacterCRDetected FilePath | CharacterCRDetected FilePath
| SpaceSuffixDetect FilePath | SpaceSuffixDetect FilePath
| VaryingNumberOfColumns FilePath
| BestPossibleValueNotObtainedWithExpectedData MetricValue MetricValue | BestPossibleValueNotObtainedWithExpectedData MetricValue MetricValue
instance Exception ValidationException instance Exception ValidationException
@ -51,12 +54,15 @@ instance Show ValidationException where
show (NoReadmeFile filePath) = somethingWrongWithFilesMessage "No README.md file" filePath show (NoReadmeFile filePath) = somethingWrongWithFilesMessage "No README.md file" filePath
show (NoGitignoreFile filePath) = somethingWrongWithFilesMessage "No .gitignore file" filePath show (NoGitignoreFile filePath) = somethingWrongWithFilesMessage "No .gitignore file" filePath
show (EmptyFile filePath) = somethingWrongWithFilesMessage "Empty file" filePath show (EmptyFile filePath) = somethingWrongWithFilesMessage "Empty file" filePath
show VaryingNumberOfLines = "The number of lines in input and expected file is not the same"
show NoTestDirectories = "No directories with test data, expected `dev-0` and/or `test-A` directory" show NoTestDirectories = "No directories with test data, expected `dev-0` and/or `test-A` directory"
show (TooManyInputFiles filePaths) = somethingWrongWithFilesMessage "Too many input files" $ intercalate "`, `" filePaths show (TooManyInputFiles filePaths) = somethingWrongWithFilesMessage "Too many input files" $ intercalate "`, `" filePaths
show (TooManyExpectedFiles filePaths) = somethingWrongWithFilesMessage "Too many expected files" $ intercalate "`, `" filePaths show (TooManyExpectedFiles filePaths) = somethingWrongWithFilesMessage "Too many expected files" $ intercalate "`, `" filePaths
show (TooManyTrainFiles filePaths) = somethingWrongWithFilesMessage "Too many train files" $ intercalate "`, `" filePaths
show (OutputFileDetected filePaths) = somethingWrongWithFilesMessage "Output file/s detected" $ intercalate "`, `" filePaths show (OutputFileDetected filePaths) = somethingWrongWithFilesMessage "Output file/s detected" $ intercalate "`, `" filePaths
show (CharacterCRDetected filePaths) = somethingWrongWithFilesMessage "Found CR (Carriage Return, 0x0D) character" filePaths show (CharacterCRDetected filePaths) = somethingWrongWithFilesMessage "Found CR (Carriage Return, 0x0D) character" filePaths
show (SpaceSuffixDetect filePaths) = somethingWrongWithFilesMessage "Found space at the end of line" filePaths show (SpaceSuffixDetect filePaths) = somethingWrongWithFilesMessage "Found space at the end of line" filePaths
show (VaryingNumberOfColumns filePaths) = somethingWrongWithFilesMessage "The file contains varying number of columns" filePaths
show (BestPossibleValueNotObtainedWithExpectedData expected got) = "The best possible value was not obtained with the expected data, expected: " ++ (show expected) ++ " , obtained: " ++ (show got) show (BestPossibleValueNotObtainedWithExpectedData expected got) = "The best possible value was not obtained with the expected data, expected: " ++ (show expected) ++ " , obtained: " ++ (show got)
validationChallenge :: FilePath -> GEvalSpecification -> IO () validationChallenge :: FilePath -> GEvalSpecification -> IO ()
@ -69,7 +75,8 @@ validationChallenge challengeDirectory spec = do
checkCorrectFile gitignoreFile checkCorrectFile gitignoreFile
checkCorrectFile readmeFile checkCorrectFile readmeFile
testDirectories <- findTestDirs challengeDirectory testDirectories <- findTestDirs challengeDirectory
checkTestDirectories testDirectories checkTestDirectories mainMetric testDirectories
checkTrainDirectory mainMetric challengeDirectory
mapM_ (runOnTest spec) testDirectories mapM_ (runOnTest spec) testDirectories
@ -77,7 +84,7 @@ validationChallenge challengeDirectory spec = do
configFile = challengeDirectory </> "config.txt" configFile = challengeDirectory </> "config.txt"
gitignoreFile = challengeDirectory </> ".gitignore" gitignoreFile = challengeDirectory </> ".gitignore"
readmeFile = challengeDirectory </> "README.md" readmeFile = challengeDirectory </> "README.md"
mainMetric = head $ gesMetrics spec
checkCorrectFile :: FilePath -> IO () checkCorrectFile :: FilePath -> IO ()
checkCorrectFile filePath = do checkCorrectFile filePath = do
@ -95,6 +102,19 @@ getFileLines file = runResourceT $ runConduit (sourceFile file
.| CC.map T.unpack .| CC.map T.unpack
.| CL.consume) .| CL.consume)
-- | Number of lines in a file, counted over the result of 'getFileLines'.
countLines :: FilePath -> IO Int
countLines file = length <$> getFileLines file
-- | Count the tab-separated fields on every line of a file.
--
-- The file is streamed through 'autoDecompress', decoded as UTF-8 and split
-- into lines; the result holds one field count per line, in file order.
numberOfColumns :: FilePath -> IO [Int]
numberOfColumns file = runResourceT $ runConduit pipeline
  where
    pipeline = sourceFile file
            .| autoDecompress
            .| CC.decodeUtf8
            .| CT.lines
            .| CC.map countFields
            .| CL.consume
    -- A line without any tab still counts as one field.
    countFields = length . T.splitOn "\t"
createPerfectOutputFromExpected :: Metric -> FilePath -> FilePath -> IO () createPerfectOutputFromExpected :: Metric -> FilePath -> FilePath -> IO ()
createPerfectOutputFromExpected metric expectedFile outFile = do createPerfectOutputFromExpected metric expectedFile outFile = do
runResourceT $ runConduit $ (sourceFile expectedFile runResourceT $ runConduit $ (sourceFile expectedFile
@ -116,6 +136,9 @@ findInputFiles = SFF.find never $ fileFilter defaultInputFile
findOutputFiles :: FilePath -> IO [FilePath] findOutputFiles :: FilePath -> IO [FilePath]
findOutputFiles = SFF.find never $ fileFilter "out*.tsv" findOutputFiles = SFF.find never $ fileFilter "out*.tsv"
-- | List the files under a directory that match the @train.tsv@ file filter
-- (see 'fileFilter').
findTrainFiles :: FilePath -> IO [FilePath]
findTrainFiles directory = SFF.find never (fileFilter "train.tsv") directory
findExpectedFiles :: FilePath -> IO [FilePath] findExpectedFiles :: FilePath -> IO [FilePath]
findExpectedFiles = SFF.find never $ fileFilter defaultExpectedFile findExpectedFiles = SFF.find never $ fileFilter defaultExpectedFile
@ -131,21 +154,28 @@ fileFilter fileName = (SFF.fileType ==? RegularFile) &&? (SFF.fileName ~~? fileN
exts = Prelude.concat [ "(", intercalate "|" compressedFilesHandled, ")" ] exts = Prelude.concat [ "(", intercalate "|" compressedFilesHandled, ")" ]
checkTestDirectories :: [FilePath] -> IO () checkTestDirectories :: Metric -> [FilePath] -> IO ()
checkTestDirectories [] = throwM NoTestDirectories checkTestDirectories _ [] = throwM NoTestDirectories
checkTestDirectories directories = mapM_ checkTestDirectory directories checkTestDirectories metric directories = mapM_ (checkTestDirectory metric) directories
checkTestDirectory :: FilePath -> IO () checkTestDirectory :: Metric -> FilePath -> IO ()
checkTestDirectory directoryPath = do checkTestDirectory metric directoryPath = do
inputFiles <- findInputFiles directoryPath inputFiles <- findInputFiles directoryPath
when (null inputFiles) $ throw $ NoInputFile inputFile when (null inputFiles) $ throw $ NoInputFile inputFile
when (length inputFiles > 1) $ throw $ TooManyInputFiles inputFiles when (length inputFiles > 1) $ throw $ TooManyInputFiles inputFiles
checkCorrectFile $ head inputFiles checkCorrectFile $ head inputFiles
when (fixedNumberOfColumnsInInput metric) $ checkColumns $ head inputFiles
expectedFiles <- findExpectedFiles directoryPath expectedFiles <- findExpectedFiles directoryPath
when (null expectedFiles) $ throw $ NoExpectedFile expectedFile when (null expectedFiles) $ throw $ NoExpectedFile expectedFile
when (length expectedFiles > 1) $ throw $ TooManyExpectedFiles expectedFiles when (length expectedFiles > 1) $ throw $ TooManyExpectedFiles expectedFiles
checkCorrectFile $ head expectedFiles checkCorrectFile $ head expectedFiles
when (fixedNumberOfColumnsInExpected metric) $ checkColumns $ head expectedFiles
inputLines <- countLines $ head inputFiles
expectedLines <- countLines $ head expectedFiles
when (inputLines /= expectedLines) $ throw $ VaryingNumberOfLines
outputFiles <- findOutputFiles directoryPath outputFiles <- findOutputFiles directoryPath
unless (null outputFiles) $ throw $ OutputFileDetected outputFiles unless (null outputFiles) $ throw $ OutputFileDetected outputFiles
@ -153,6 +183,24 @@ checkTestDirectory directoryPath = do
inputFile = directoryPath </> defaultInputFile inputFile = directoryPath </> defaultInputFile
expectedFile = directoryPath </> defaultExpectedFile expectedFile = directoryPath </> defaultExpectedFile
-- | Validate the optional @train@ directory of a challenge.
--
-- Nothing is checked when @challengeDirectory/train@ does not exist.
-- Otherwise exactly one train file must be found there: 'NoInputFile' is
-- thrown when there is none and 'TooManyTrainFiles' when there are several.
-- The single train file is passed to 'checkCorrectFile' and, when the metric
-- has a fixed number of columns in both input and expected data, to
-- 'checkColumns'.
checkTrainDirectory :: Metric -> FilePath -> IO ()
checkTrainDirectory metric challengeDirectory = do
  let trainDirectory = challengeDirectory </> "train"
  whenM (doesDirectoryExist trainDirectory) $ do
    trainFiles <- findTrainFiles trainDirectory
    -- Matching on the list shape keeps this total; no partial pattern binding
    -- is needed to get at the single train file.
    case trainFiles of
      [] -> throwM $ NoInputFile "train.tsv"
      [trainFile] -> do
        checkCorrectFile trainFile
        when (fixedNumberOfColumnsInInput metric && fixedNumberOfColumnsInExpected metric) $
          checkColumns trainFile
      _ -> throwM $ TooManyTrainFiles trainFiles
-- | Check that every line of a file has the same number of tab-separated
-- columns.
--
-- Throws 'VaryingNumberOfColumns' (carrying the file path) when at least two
-- lines differ in their column count. An empty file passes.
checkColumns :: FilePath -> IO ()
checkColumns filePath = do
  columns <- numberOfColumns filePath
  -- 'nub' is lazy, so matching on its first two elements stops as soon as a
  -- second distinct column count is found instead of deduplicating the whole
  -- list just to measure its length.
  case nub columns of
    (_ : _ : _) -> throwM $ VaryingNumberOfColumns filePath
    _           -> return ()
runOnTest :: GEvalSpecification -> FilePath -> IO () runOnTest :: GEvalSpecification -> FilePath -> IO ()
runOnTest spec testPath = do runOnTest spec testPath = do
[expectedFile] <- findExpectedFiles testPath [expectedFile] <- findExpectedFiles testPath