pjn-2024-cw/examples.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### nie robimy 2 nowych linii w bloku funkcji. sentences[::2] oraz sentences[1::2] powinny być przypisane do osobnych zmiennych"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pies ten pochodzi z południowych Chin, z terenów prowincji Guangdong! \n",
"Został rozpropagowany i hodowany w celach wystawowych przez hodowców w USA. \n",
"Nazwa psa, pochodząca z chińskiego, oznacza dosłownie piaszczysta skóra. \n",
"['Pies ten pochodzi z południowych Chin, z terenów prowincji Guangdong! ', 'Został rozpropagowany i hodowany w celach wystawowych przez hodowców w USA. ', 'Nazwa psa, pochodząca z chińskiego, oznacza dosłownie piaszczysta skóra. ']\n"
]
}
],
"source": [
"import re\n",
"tekst = \"Pies ten pochodzi z południowych Chin, z terenów prowincji Guangdong! Został rozpropagowany i hodowany w celach wystawowych przez hodowców w USA. Nazwa psa, pochodząca z chińskiego, oznacza dosłownie piaszczysta skóra. Chart polski polska rasa psa myśliwskiego, znana prawdopodobnie od czasów Galla Anonima, zaliczana do grupy chartów.\"\n",
"def split_sentences(text):\n",
" sentences = re.split(r'([.!?]\\s+)(?=[A-Z])', text)\n",
"\n",
"\n",
" full_sentences = [''.join(pair) for pair in zip(sentences[::2], sentences[1::2])]\n",
"\n",
"\n",
" for sentence in full_sentences:\n",
" print(sentence)\n",
" print(full_sentences)\n",
"split_sentences(tekst)"
]
},
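{
"cell_type": "markdown",
"metadata": {},
"source": [
"A possible cleanup along the lines of the comment above (a sketch; the variable names and the zip_longest handling of the final sentence are ours, not from the original):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from itertools import zip_longest\n",
"\n",
"def split_sentences(text):\n",
"    parts = re.split(r'([.!?]\\s+)(?=[A-Z])', text)\n",
"    bodies = parts[::2]       # sentence bodies\n",
"    separators = parts[1::2]  # terminating punctuation plus whitespace\n",
"    # zip_longest keeps the last sentence, which has no separator after it\n",
"    full_sentences = [body + sep for body, sep in zip_longest(bodies, separators, fillvalue='')]\n",
"    for sentence in full_sentences:\n",
"        print(sentence)\n",
"\n",
"split_sentences(tekst)"
]
},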
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Niewłaściwa nazwa funkcji switch_letter (robi coś innego, niż nazwa na to wskazuje). Linijka z sum jest nieczytelna."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- faja.\n"
]
}
],
"source": [
"text = \"kurde faja.\"\n",
"\n",
"vulgar_words_base = [\"kurd\", \"choler\"]\n",
"\n",
"def switch_letter(word, vulgar_word_list):\n",
" word = word.lower()\n",
" for bad_word in vulgar_word_list:\n",
" switched_letters = sum(1 for a, b in zip(word, bad_word) if a != b)\n",
" if switched_letters == 1:\n",
" return True\n",
" return False\n",
"\n",
"def censor_text(text):\n",
" pattern = re.compile(r'[^\\s]*(' + '|'.join([f'{word}' for word in vulgar_words_base]) + r')[^\\s]*', re.IGNORECASE)\n",
" censored_text = pattern.sub(\"---\", text)\n",
"\n",
" censored_text_list = censored_text.split()\n",
" \n",
" for i, word in enumerate(censored_text_list):\n",
" if switch_letter(word, vulgar_words_base):\n",
" censored_text_list[i] = \"---\"\n",
" final_censored_text = \" \".join(censored_text_list)\n",
"\n",
" return final_censored_text\n",
"\n",
"print(censor_text(text))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"switch_letter(\"kurcze\", [\"kurzce\"])"
]
},
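{
"cell_type": "markdown",
"metadata": {},
"source": [
"A possible rewrite along the lines of the comment above (a sketch; differs_by_one_letter is our name). The explicit counting loop replaces the one-line sum, and the name says what is actually checked:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def differs_by_one_letter(word, vulgar_words):\n",
"    word = word.lower()\n",
"    for vulgar_word in vulgar_words:\n",
"        differing_positions = 0\n",
"        for word_letter, vulgar_letter in zip(word, vulgar_word):\n",
"            if word_letter != vulgar_letter:\n",
"                differing_positions += 1\n",
"        if differing_positions == 1:\n",
"            return True\n",
"    return False\n",
"\n",
"differs_by_one_letter(\"kurcze\", [\"kurzce\"])  # still False: two positions differ, because the letters are swapped, not substituted"
]
},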
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Jeżeli nie ma takiej konieczności nie iterujemy po rozdzielonym na słowa tekście, tylko na całym tekście."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Siała baba mak.\n",
"Czy wiedziała jak?\n",
"Dziadek wiedział, nie powiedział, a to było tak!\n"
]
}
],
"source": [
"# Solution 2\n",
"text = 'Siała baba mak. Czy wiedziała jak? Dziadek wiedział, nie powiedział, a to było tak!'\n",
"sentences = []\n",
"\n",
"def split_sentences(text):\n",
" sentence = ''\n",
" for word in text.split():\n",
" x = re.search(r'[a-zA-Z0-9]+[.?!]', word)\n",
" if x is None:\n",
" sentence += f'{word} '\n",
" else:\n",
" sentence += word\n",
" sentences.append(sentence)\n",
" sentence = ''\n",
" for result in sentences:\n",
" print(result)\n",
"\n",
"\n",
"split_sentences(text)"
]
},
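{
"cell_type": "markdown",
"metadata": {},
"source": [
"A whole-text alternative along the lines of the comment above (a sketch): a single regex pass over the full string instead of a word-by-word loop."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def split_sentences(text):\n",
"    # match runs of non-terminator characters followed by ., ? or !\n",
"    for sentence in re.findall(r'[^.?!]+[.?!]', text):\n",
"        print(sentence.strip())\n",
"\n",
"split_sentences(text)"
]
},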
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Nie stosujemy zapisu if {zmienna}, tylko if {zmienna} is True/False. Kod dla danego warunku przenosimy do nowej linii"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def validate_name(name):\n",
" valid = re.match(r'^[A-Z][a-z]{1,}',name)\n",
" if valid: return True\n",
" else: return False\n"
]
},
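{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same function rewritten as the comment suggests (a sketch; note that {1,} in the pattern is just + spelled out):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def validate_name(name):\n",
"    valid = re.match(r'^[A-Z][a-z]+', name)\n",
"    if valid is None:\n",
"        return False\n",
"    return True"
]
},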
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Przykład właściwego zastosowania komentarza"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def censor_text(text):\n",
" prefixes = r'(do|na|o|od|pod|po|prze|przy|roz|s|u|w|y|za|z|u)*'\n",
"\n",
" # profanities according to prof. Jerzy Bralczyk\n",
" profanities = [ \n",
" rf'\\b{prefixes}(pierd\\w*)\\b',\n",
" ]\n",
"\n",
" profanity_pattern = re.compile('|'.join(profanities), re.IGNORECASE)\n",
"\n",
" return profanity_pattern.sub('---', text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# jeżeli ten if określa 3 warianty na tym samym poziomie, to nie stosujemy zagnieżdżenia warunków\n",
"if [ \"$positive_count\" -gt \"$negative_count\" ]; then\n",
" echo \"wydzwiek pozytywny\"\n",
"else\n",
" if [ \"$negative_count\" -gt \"$positive_count\" ]; then\n",
" echo \"wydzwiek: negatywny\"\n",
" else\n",
" echo \"wydzwiek: neutralny\"\n",
" fi\n",
"fi\n",
"\n",
"\n",
"# ten else nigdy się nie wywoła - nie powinno go być\n",
"if [ $positive_count -gt $negative_count ]\n",
" then echo \"Positive\"\n",
"elif [ $positive_count -lt $negative_count ]\n",
" then echo \"Negative\"\n",
"elif [ $positive_count -eq $negative_count ]\n",
" then echo \"Neutral\"\n",
"else\n",
" echo \"Error\" # to nie istnieje\n",
"fi\n",
"\n",
"\n",
"# positive - taki błąd mocno rzuca się w oczy (mimo że program działa)\n",
"POZITIVE=\"positive-words.txt\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Notebook 05"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# algorytm wzięty z pseudokodu z prezentacji profesora Jassema\n",
"def maxmatch_text_split(text, vocabulary):\n",
" if text == \"\":\n",
" return []\n",
" for i in range(len(text)-1, -1, -1):\n",
" firstword = text[0:i+1] # nie piszemy [0:x] tylko [:x]\n",
" reminder = text[i+1:]\n",
" if firstword in vocabulary:\n",
" return [firstword] + maxmatch_text_split(reminder, vocabulary)\n",
" firstword = text[0]\n",
" reminder = text[1]\n",
" return [firstword] + maxmatch_text_split(reminder, vocabulary)"
]
},
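{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick check of the function above (the toy vocabulary is made up for illustration):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"toy_vocabulary = {\"pies\", \"ten\", \"pochodzi\"}\n",
"maxmatch_text_split(\"piestenpochodzi\", toy_vocabulary)\n",
"# ['pies', 'ten', 'pochodzi']"
]
},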
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def create_bpe_tokenizer(text, max_vocab_length):\n",
" nfoiwanfoiwa\n",
" \n",
" for x in range(10):\n",
" nfwoiaf\n",
" \n",
" awfnoia\n",
" if noiawniofa:\n",
" iognioe\n",
" else:\n",
" nawoinoigagna\n",
" fniaw..\n",
"\n",
" return 0\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import string\n",
"from collections import Counter\n",
"import re\n",
"\n",
"\n",
"def create_bpe_tokenizer(text, max_vocab_length):\n",
" text = (\"\".join(x for x in text if x not in string.punctuation)).lower()\n",
" vocabulary = list(set([x for x in text]))\n",
" while len(vocabulary)<max_vocab_length:\n",
" text = re.findall(\"|\".join(vocabulary), \"\".join(text))\n",
" list_bigrams = []\n",
" for i in range(0, len(text)-1):\n",
" list_bigrams.append(\"\".join(text[i:i+2]))\n",
" bi_freq = Counter(list_bigrams)\n",
" if all(i == 1 for i in bi_freq.values()):\n",
" break\n",
" sorted_bigram_list = sorted(bi_freq.items(), key = lambda x: list_bigrams.index(x[0]))\n",
" sorted_bigram_dict={}\n",
" for key, value in sorted_bigram_list:\n",
" sorted_bigram_dict[key] = value\n",
" vocabulary.append(max(sorted_bigram_dict, key=sorted_bigram_dict.get))\n",
" vocabulary = sorted(vocabulary, key = len, reverse=True)\n",
" vocabulary = sorted(vocabulary, key = len, reverse=True)\n",
" text = re.findall(\"|\".join(vocabulary), \"\".join(text))\n",
" # print( len(vocabulary), sorted(vocabulary, key = len))\n",
" return vocabulary\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Próba \"uratowania\" powyższego kodu"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def all_frequencies_are_ones(bigram_freqs):\n",
" return all(i == 1 for i in bigram_freqs.values())\n",
"\n",
"\n",
"def create_bpe_tokenizer2(text, max_vocab_length):\n",
" text = (\"\".join(x for x in text if x not in string.punctuation)).lower()\n",
" vocabulary = list(set(text))\n",
"\n",
" while len(vocabulary) < max_vocab_length:\n",
" text = re.findall(\"|\".join(vocabulary), \"\".join(text))\n",
" bigrams = []\n",
"\n",
" for i in range(0, len(text)-1):\n",
" bigrams.append(\"\".join(text[i:i+2]))\n",
"\n",
" bigram_freq = Counter(bigrams)\n",
" if all_frequencies_are_ones(bigram_freq):\n",
" break\n",
"\n",
" most_common_bigram = bigram_freq.most_common(1)[0][0]\n",
" vocabulary.append(most_common_bigram)\n",
" vocabulary = sorted(vocabulary, key = len, reverse=True)\n",
" \n",
" vocabulary = sorted(vocabulary, key = len, reverse=True)\n",
"\n",
" return vocabulary"
]
},
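{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick smoke test of the rescued version (the toy text is made up here):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokens = create_bpe_tokenizer2(\"low lower lowest\", 12)\n",
"print(tokens)  # single characters plus merged units such as 'lo' and 'low'"
]
},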
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Warto zapoznać się z obiektami z paczki collections"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"\n",
"pairs = {}\n",
"for sequence in vocab:\n",
" symbols = sequence.split()\n",
" for i in range(len(symbols) - 1):\n",
" pair = (symbols[i], symbols[i + 1])\n",
" if pair in pairs:\n",
" pairs[pair] += vocab[sequence]\n",
" else:\n",
" pairs[pair] = vocab[sequence]\n",
"\n",
"# to samo co\n",
"pairs = defaultdict(int)\n",
"for sequence in vocab:\n",
" symbols = sequence.split()\n",
" for i in range(len(symbols) - 1):\n",
" pair = (symbols[i], symbols[i + 1])\n",
" pairs[pair] += vocab[sequence]"
]
},
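{
"cell_type": "markdown",
"metadata": {},
"source": [
"A runnable demonstration of the snippet above (the toy vocab is ours; it maps space-separated symbol sequences to counts):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"\n",
"vocab = {\"l o w\": 5, \"l o w e r\": 2}\n",
"\n",
"pairs = defaultdict(int)\n",
"for sequence in vocab:\n",
"    symbols = sequence.split()\n",
"    for i in range(len(symbols) - 1):\n",
"        pairs[(symbols[i], symbols[i + 1])] += vocab[sequence]\n",
"\n",
"print(dict(pairs))  # {('l', 'o'): 7, ('o', 'w'): 7, ('w', 'e'): 2, ('e', 'r'): 2}"
]
},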
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Nie uzywamy dlugich slow na iteratory"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def maxmatch_text_split(text, vocabulary):\n",
" words_list = []\n",
" iterator = 0\n",
" \n",
" while iterator < len(text):\n",
" \n",
" for backwards_iterator in range(len(text), iterator, -1):\n",
" # if text[iterator : backwards_iterator] in vocabulary: \n",
" if text[iterator : backwards_iterator].lower() in vocabulary: #.lower() because every token is lower case in vocab\n",
" words_list.append(text[iterator : backwards_iterator]) #.lower() if want to have exact same tokens as in vocab\n",
" break\n",
" elif backwards_iterator == iterator + 1:\n",
" words_list.append(text[iterator : backwards_iterator])\n",
" \n",
" iterator += len(words_list[-1])\n",
" return words_list"
]
},
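{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same function with short index names, as the comment above suggests (a sketch):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def maxmatch_text_split(text, vocabulary):\n",
"    words = []\n",
"    i = 0\n",
"\n",
"    while i < len(text):\n",
"        for j in range(len(text), i, -1):\n",
"            if text[i:j].lower() in vocabulary:\n",
"                words.append(text[i:j])\n",
"                break\n",
"            elif j == i + 1:\n",
"                words.append(text[i:j])\n",
"\n",
"        i += len(words[-1])\n",
"    return words"
]
},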
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Niedopuszczalne są takie nazwy zmiennych (z błędami!)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dictinary_of_pairs_occurance = {}"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}