#!/bin/bash
# Calculate the number of occurrences of each word in a bzip2-compressed text.
#
# Usage: word_freq.sh [FILE.bz2]
#   FILE.bz2  bzip2-compressed input. When omitted (or empty), bzcat reads the
#             compressed stream from stdin.
#
# Output: word_freq.txt in the current directory, one "<count> <word>" line per
# distinct word (uniq -c format), sorted by count in descending order.

# Make the pipeline fail when any stage fails, so an unreadable or corrupt
# archive shows up in the exit status instead of being masked by the last sort.
set -o pipefail

# Decompress the first argument to stdout, or stdin when no argument is given.
# The argument is quoted so paths containing spaces or glob characters are
# passed to bzcat as a single file name; `--` protects names starting with '-'.
decompress() {
  if [[ -n "${1-}" ]]; then
    bzcat -- "$1"
  else
    bzcat
  fi
}

# Filter stdin down to the lines accepted by the word pattern.
# grep exits 1 when no line matches; that is an empty result, not an error,
# so only statuses other than 1 are propagated as failures.
#
# NOTE(review): in a POSIX bracket expression a backslash is a literal
# character, so `\x00-\x7F` is presumably NOT read by grep -E as the byte range
# 0x00-0x7F. The pattern is kept exactly as it was; confirm the intended filter.
keep_words() {
  grep -E '^[^\x00-\x7F]*[[:alpha:]][^\x00-\x7F]*$' || [[ $? -eq 1 ]]
}

# tr puts each token on its own line by replacing punctuation/whitespace with
# newlines and squeezing runs; sort | uniq -c counts identical lines; the final
# numeric reverse sort puts the most frequent words first.
decompress "$@" \
  | tr -s '[:punct:][:space:]' '\n' \
  | keep_words \
  | sort \
  | uniq -c \
  | sort -nr > word_freq.txt