calculate frequency of bigrams in text
This commit is contained in:
parent
2afff87ba0
commit
d6be2a2ce9
4
analysis/bigrams_freq.sh
Executable file
4
analysis/bigrams_freq.sh
Executable file
@ -0,0 +1,4 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Lowercase the text, reduce it to purely alphabetic words, then use awk to print each pair of adjacent words (bigrams); sort | uniq -c counts them into bigram_freq.txt
|
||||
# Make a failure in any pipeline stage (e.g. bzcat on a missing or corrupt
# archive) show up in the exit status instead of being masked by uniq.
set -o pipefail

#######################################
# Count word bigrams in a bzip2-compressed text file.
# The text is lowercased and every run of non-alphabetic characters is
# treated as a single word separator (line breaks included), so bigrams
# span line boundaries.
# Arguments: $1 - path to the bzip2-compressed input text
# Outputs:   writes "COUNT WORD1 WORD2" lines (uniq -c format, sorted by
#            bigram) to bigram_freq.txt in the current directory
# Returns:   0 on success, 2 if no input file is given, otherwise the
#            status of the failing pipeline stage
#######################################
bigram_freq() {
  if (( $# < 1 )) || [[ -z "$1" ]]; then
    printf 'Usage: %s FILE.bz2\n' "${0##*/}" >&2
    return 2
  fi

  # Emit one word per line and let awk pair each word with its predecessor.
  # This keeps every line short no matter how large the input is, so there
  # is no need to chop the stream into fixed-width chunks (which would cut
  # words in half and drop the bigrams that straddle a chunk boundary).
  bzcat -- "$1" \
    | tr '[:upper:]' '[:lower:]' \
    | tr -cs '[:alpha:]' '\n' \
    | awk 'NF { if (prev != "") print prev, $0; prev = $0 }' \
    | sort \
    | uniq -c > bigram_freq.txt
}

bigram_freq "$@"
|
Loading…
Reference in New Issue
Block a user