#!/bin/bash
#
# Text statistics for a bzip2-compressed text file.
#
# Usage: advanced.sh FILE.bz2
#
# Prints the number of commas, dots, white spaces and line breaks to stdout
# and writes three reports into the current directory:
#   word_freq.txt    occurrences of each word, most frequent first
#   bigram_freq.txt  occurrences of each pair of adjacent words
#   len_freq.txt     number of words for each word length
#
# Every helper below reads plain (already decompressed) text from stdin so it
# can be used and tested independently of bzcat.

# No `set -e`: failures are handled explicitly so that main can report them
# and hand its status back as the exit status of the script.
set -uo pipefail

#######################################
# Count how often one character occurs in stdin.
# Arguments: $1 - character to count, in tr(1) notation (e.g. ',' or '\n')
# Outputs:   the count, as a bare integer, on stdout
#######################################
count_char() {
  local n
  n=$(tr -cd "$1" | wc -c) || return
  # Some wc implementations pad the number; arithmetic expansion strips that.
  printf '%s\n' "$((n))"
}

#######################################
# Word frequency table, most frequent first.
# Text is lowercased and split on punctuation/whitespace; only tokens that
# contain at least one letter are counted.
# Outputs: "COUNT WORD" lines (uniq -c format) on stdout
#######################################
word_freq() {
  tr '[:upper:]' '[:lower:]' \
    | tr -s '[:punct:][:space:]' '\n' \
    | sed -n '/[[:alpha:]]/p' \
    | sort \
    | uniq -c \
    | sort -nr
}

#######################################
# Bigram (adjacent word pair) frequency table, most frequent first.
# Text is lowercased and every run of non-letters is a word separator.
# Words are streamed one per line so no pair is lost at a chunk boundary.
# Outputs: "COUNT WORD1 WORD2" lines (uniq -c format) on stdout
#######################################
bigram_freq() {
  tr '[:upper:]' '[:lower:]' \
    | tr -cs '[:alpha:]' '\n' \
    | awk '
        length($0) == 0 { next }            # leading separator yields an empty token
        seen            { print prev, $0 }
                        { prev = $0; seen = 1 }
      ' \
    | sort \
    | uniq -c \
    | sort -nr
}

#######################################
# Histogram of word lengths, shortest first.
# A word is a maximal run of alphanumeric characters.
# Outputs: "COUNT LENGTH" lines (uniq -c format) on stdout
#######################################
len_freq() {
  tr -cs '[:alnum:]' '\n' \
    | awk 'length($0) > 0 { print length($0) }' \
    | sort -n \
    | uniq -c
}

#######################################
# Run every analysis on the given bzip2 file.
# Arguments: $1 - path to the bzip2-compressed input
# Outputs:   counts on stdout; word_freq.txt, bigram_freq.txt and
#            len_freq.txt in the current directory
# Returns:   0 on success, 2 on usage error, non-zero if a step fails
#######################################
main() {
  if (( $# < 1 )); then
    printf 'Usage: %s FILE.bz2\n' "${0##*/}" >&2
    return 2
  fi

  local input=$1
  if [[ ! -r "$input" ]]; then
    printf '%s: cannot read input file: %s\n' "${0##*/}" "$input" >&2
    return 1
  fi
  if ! command -v bzcat >/dev/null; then
    printf '%s: bzcat not found\n' "${0##*/}" >&2
    return 1
  fi

  # Count commas, dots, spaces and line breaks.
  echo "Ammount of comas"
  bzcat -- "$input" | count_char ',' || return
  echo "Ammount of dots"
  bzcat -- "$input" | count_char '.' || return
  echo "Ammount of white spaces"
  bzcat -- "$input" | count_char ' ' || return
  echo "Ammount of line breaks"
  bzcat -- "$input" | count_char '\n' || return

  # Number of occurrences of each word.
  echo "calculate no occurances of each word"
  bzcat -- "$input" | word_freq > word_freq.txt || return

  # Number of occurrences of each pair of adjacent words.
  echo "calculate bigrams frequency in text"
  bzcat -- "$input" | bigram_freq > bigram_freq.txt || return

  # Number of words for each word length.
  echo "calculate no occurances of words depending on length"
  bzcat -- "$input" | len_freq > len_freq.txt || return
}

main "$@"