pjn-2024-cw/examples.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### nie robimy 2 nowych linii w bloku funkcji. sentences[::2] oraz sentences[1::2] powinny być przypisane do osobnych zmiennych"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pies ten pochodzi z południowych Chin, z terenów prowincji Guangdong! \n",
      "Został rozpropagowany i hodowany w celach wystawowych przez hodowców w USA. \n",
      "Nazwa psa, pochodząca z chińskiego, oznacza dosłownie piaszczysta skóra. \n",
      "['Pies ten pochodzi z południowych Chin, z terenów prowincji Guangdong! ', 'Został rozpropagowany i hodowany w celach wystawowych przez hodowców w USA. ', 'Nazwa psa, pochodząca z chińskiego, oznacza dosłownie piaszczysta skóra. ']\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "tekst =  \"Pies ten pochodzi z południowych Chin, z terenów prowincji Guangdong! Został rozpropagowany i hodowany w celach wystawowych przez hodowców w USA. Nazwa psa, pochodząca z chińskiego, oznacza dosłownie piaszczysta skóra. Chart polski  polska rasa psa myśliwskiego, znana prawdopodobnie od czasów Galla Anonima, zaliczana do grupy chartów.\"\n",
    "def split_sentences(text):\n",
    "    sentences = re.split(r'([.!?]\\s+)(?=[A-Z])', text)\n",
    "\n",
    "\n",
    "    full_sentences = [''.join(pair) for pair in zip(sentences[::2], sentences[1::2])]\n",
    "\n",
    "\n",
    "    for sentence in full_sentences:\n",
    "        print(sentence)\n",
    "    print(full_sentences)\n",
    "split_sentences(tekst)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Niewłaściwa nazwa funkcji switch_letter (robi coś innego, niż nazwa na to wskazuje). Linijka z sum jest nieczytelna."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--- faja.\n"
     ]
    }
   ],
   "source": [
    "text = \"kurde faja.\"\n",
    "\n",
    "vulgar_words_base = [\"kurd\", \"choler\"]\n",
    "\n",
    "def switch_letter(word, vulgar_word_list):\n",
    "    word = word.lower()\n",
    "    for bad_word in vulgar_word_list:\n",
    "        switched_letters = sum(1 for a, b in zip(word, bad_word) if a != b)\n",
    "        if switched_letters == 1:\n",
    "            return True\n",
    "    return False\n",
    "\n",
    "def censor_text(text):\n",
    "    pattern = re.compile(r'[^\\s]*(' + '|'.join([f'{word}' for word in vulgar_words_base]) + r')[^\\s]*', re.IGNORECASE)\n",
    "    censored_text = pattern.sub(\"---\", text)\n",
    "\n",
    "    censored_text_list = censored_text.split()\n",
    "    \n",
    "    for i, word in enumerate(censored_text_list):\n",
    "        if switch_letter(word, vulgar_words_base):\n",
    "            censored_text_list[i] = \"---\"\n",
    "    final_censored_text = \" \".join(censored_text_list)\n",
    "\n",
    "    return final_censored_text\n",
    "\n",
    "print(censor_text(text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "switch_letter(\"kurcze\", [\"kurzce\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Jeżeli nie ma takiej konieczności nie iterujemy po rozdzielonym na słowa tekście, tylko na całym tekście."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Siała baba mak.\n",
      "Czy wiedziała jak?\n",
      "Dziadek wiedział, nie powiedział, a to było tak!\n"
     ]
    }
   ],
   "source": [
    "# Solution 2\n",
    "text = 'Siała baba mak. Czy wiedziała jak? Dziadek wiedział, nie powiedział, a to było tak!'\n",
    "sentences = []\n",
    "\n",
    "def split_sentences(text):\n",
    "    sentence = ''\n",
    "    for word in text.split():\n",
    "        x = re.search(r'[a-zA-Z0-9]+[.?!]', word)\n",
    "        if x is None:\n",
    "            sentence += f'{word} '\n",
    "        else:\n",
    "            sentence += word\n",
    "            sentences.append(sentence)\n",
    "            sentence = ''\n",
    "    for result in sentences:\n",
    "        print(result)\n",
    "\n",
    "\n",
    "split_sentences(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Nie stosujemy zapisu if {zmienna}, tylko if {zmienna} is True/False. Kod dla danego warunku przenosimy do nowej linii"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def validate_name(name):\n",
    "    valid = re.match(r'^[A-Z][a-z]{1,}',name)\n",
    "    if valid: return True\n",
    "    else:  return False\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Przykład właściwego zastosowania komentarza"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def censor_text(text):\n",
    "    prefixes = r'(do|na|o|od|pod|po|prze|przy|roz|s|u|w|y|za|z|u)*'\n",
    "\n",
    "    # profanities according to prof. Jerzy Bralczyk\n",
    "    profanities = [ \n",
    "        rf'\\b{prefixes}(pierd\\w*)\\b',\n",
    "    ]\n",
    "\n",
    "    profanity_pattern = re.compile('|'.join(profanities), re.IGNORECASE)\n",
    "\n",
    "    return profanity_pattern.sub('---', text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# jeżeli ten if określa 3 warianty na tym samym poziomie, to nie stosujemy zagnieżdżenia warunków\n",
    "if [ \"$positive_count\" -gt \"$negative_count\" ]; then\n",
    "    echo \"wydzwiek pozytywny\"\n",
    "else\n",
    "    if [ \"$negative_count\" -gt \"$positive_count\" ]; then\n",
    "        echo \"wydzwiek: negatywny\"\n",
    "    else\n",
    "        echo \"wydzwiek: neutralny\"\n",
    "    fi\n",
    "fi\n",
    "\n",
    "\n",
    "# ten else nigdy się nie wywoła - nie powinno go być\n",
    "if [ $positive_count -gt $negative_count ]\n",
    "        then echo \"Positive\"\n",
    "elif [ $positive_count -lt $negative_count ]\n",
    "        then echo \"Negative\"\n",
    "elif [ $positive_count -eq $negative_count ]\n",
    "        then echo \"Neutral\"\n",
    "else\n",
    "        echo \"Error\" # to nie istnieje\n",
    "fi\n",
    "\n",
    "\n",
    "# positive - taki błąd mocno rzuca się w oczy (mimo że program działa)\n",
    "POZITIVE=\"positive-words.txt\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Notebook 05"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# algorytm wzięty z pseudokodu z prezentacji profesora Jassema\n",
    "def maxmatch_text_split(text, vocabulary):\n",
    "    if text == \"\":\n",
    "        return []\n",
    "    for i in range(len(text)-1, -1, -1):\n",
    "        firstword = text[0:i+1] # nie piszemy [0:x] tylko [:x]\n",
    "        reminder = text[i+1:]\n",
    "        if firstword in vocabulary:\n",
    "            return [firstword] + maxmatch_text_split(reminder, vocabulary)\n",
    "    firstword = text[0]\n",
    "    reminder = text[1]\n",
    "    return [firstword] + maxmatch_text_split(reminder, vocabulary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_bpe_tokenizer(text, max_vocab_length):\n",
    "    nfoiwanfoiwa\n",
    "    \n",
    "    for x in range(10):\n",
    "        nfwoiaf\n",
    "        \n",
    "    awfnoia\n",
    "    if noiawniofa:\n",
    "        iognioe\n",
    "    else:\n",
    "        nawoinoigagna\n",
    "    fniaw..\n",
    "\n",
    "    return 0\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import string\n",
    "from collections import Counter\n",
    "import re\n",
    "\n",
    "\n",
    "def create_bpe_tokenizer(text, max_vocab_length):\n",
    "    text  = (\"\".join(x for x in text if x not in string.punctuation)).lower()\n",
    "    vocabulary = list(set([x for x in text]))\n",
    "    while len(vocabulary)<max_vocab_length:\n",
    "        text = re.findall(\"|\".join(vocabulary), \"\".join(text))\n",
    "        list_bigrams = []\n",
    "        for i in range(0, len(text)-1):\n",
    "            list_bigrams.append(\"\".join(text[i:i+2]))\n",
    "        bi_freq = Counter(list_bigrams)\n",
    "        if all(i == 1 for i in bi_freq.values()):\n",
    "            break\n",
    "        sorted_bigram_list = sorted(bi_freq.items(), key = lambda x: list_bigrams.index(x[0]))\n",
    "        sorted_bigram_dict={}\n",
    "        for key, value in sorted_bigram_list:\n",
    "            sorted_bigram_dict[key] = value\n",
    "        vocabulary.append(max(sorted_bigram_dict, key=sorted_bigram_dict.get))\n",
    "        vocabulary = sorted(vocabulary, key = len, reverse=True)\n",
    "    vocabulary = sorted(vocabulary, key = len, reverse=True)\n",
    "    text = re.findall(\"|\".join(vocabulary), \"\".join(text))\n",
    "    # print( len(vocabulary), sorted(vocabulary, key = len))\n",
    "    return vocabulary\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Próba \"uratowania\" powyższego kodu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def all_frequencies_are_ones(bigram_freqs):\n",
    "    return all(i == 1 for i in bigram_freqs.values())\n",
    "\n",
    "\n",
    "def create_bpe_tokenizer2(text, max_vocab_length):\n",
    "    text  = (\"\".join(x for x in text if x not in string.punctuation)).lower()\n",
    "    vocabulary = list(set(text))\n",
    "\n",
    "    while len(vocabulary) < max_vocab_length:\n",
    "        text = re.findall(\"|\".join(vocabulary), \"\".join(text))\n",
    "        bigrams = []\n",
    "\n",
    "        for i in range(0, len(text)-1):\n",
    "            bigrams.append(\"\".join(text[i:i+2]))\n",
    "\n",
    "        bigram_freq = Counter(bigrams)\n",
    "        if all_frequencies_are_ones(bigram_freq):\n",
    "            break\n",
    "\n",
    "        most_common_bigram = bigram_freq.most_common(1)[0][0]\n",
    "        vocabulary.append(most_common_bigram)\n",
    "        vocabulary = sorted(vocabulary, key = len, reverse=True)\n",
    "    \n",
    "    vocabulary = sorted(vocabulary, key = len, reverse=True)\n",
    "\n",
    "    return vocabulary"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Warto zapoznać się z obiektami z paczki collections"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "pairs = {}\n",
    "for sequence in vocab:\n",
    "    symbols = sequence.split()\n",
    "    for i in range(len(symbols) - 1):\n",
    "        pair = (symbols[i], symbols[i + 1])\n",
    "        if pair in pairs:\n",
    "            pairs[pair] += vocab[sequence]\n",
    "        else:\n",
    "            pairs[pair] = vocab[sequence]\n",
    "\n",
    "# to samo co\n",
    "pairs = defaultdict(int)\n",
    "for sequence in vocab:\n",
    "    symbols = sequence.split()\n",
    "    for i in range(len(symbols) - 1):\n",
    "        pair = (symbols[i], symbols[i + 1])\n",
    "        pairs[pair] += vocab[sequence]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Nie uzywamy dlugich slow na iteratory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def maxmatch_text_split(text, vocabulary):\n",
    "    words_list = []\n",
    "    iterator = 0\n",
    "    \n",
    "    while iterator < len(text):\n",
    "        \n",
    "        for backwards_iterator in range(len(text), iterator, -1):\n",
    "            # if text[iterator : backwards_iterator] in vocabulary: \n",
    "            if text[iterator : backwards_iterator].lower() in vocabulary: #.lower() because every token is lower case in vocab\n",
    "                words_list.append(text[iterator : backwards_iterator]) #.lower() if want to have exact same tokens as in vocab\n",
    "                break\n",
    "            elif backwards_iterator == iterator + 1:\n",
    "                words_list.append(text[iterator : backwards_iterator])\n",
    "                \n",
    "        iterator += len(words_list[-1])\n",
    "    return words_list"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Niedopuszczalne są takie nazwy zmiennych (z błędami!)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dictinary_of_pairs_occurance = {}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Uproszczenie funkcji"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def has_vowel(word):\n",
    "    check = False\n",
    "    for i in range(0,len(word)):\n",
    "        if is_vowel(word, i) == True:\n",
    "            check = True\n",
    "            break\n",
    "    return check\n",
    "\n",
    "\n",
    "def has_vowel(word):\n",
    "    for x in range(len(word)):\n",
    "        if is_vowel(word, x) is True:\n",
    "            return True\n",
    "    \n",
    "    return False"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
Update files and add examples 2024-10-23 15:13:08 +02:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"#### nie robimy 2 nowych linii w bloku funkcji. sentences[::2] oraz sentences[1::2] powinny być przypisane do osobnych zmiennych"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 1,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Pies ten pochodzi z południowych Chin, z terenów prowincji Guangdong! \n",`
			`"Został rozpropagowany i hodowany w celach wystawowych przez hodowców w USA. \n",`
			`"Nazwa psa, pochodząca z chińskiego, oznacza dosłownie piaszczysta skóra. \n",`
			`"['Pies ten pochodzi z południowych Chin, z terenów prowincji Guangdong! ', 'Został rozpropagowany i hodowany w celach wystawowych przez hodowców w USA. ', 'Nazwa psa, pochodząca z chińskiego, oznacza dosłownie piaszczysta skóra. ']\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"import re\n",`
			`"tekst = \"Pies ten pochodzi z południowych Chin, z terenów prowincji Guangdong! Został rozpropagowany i hodowany w celach wystawowych przez hodowców w USA. Nazwa psa, pochodząca z chińskiego, oznacza dosłownie piaszczysta skóra. Chart polski polska rasa psa myśliwskiego, znana prawdopodobnie od czasów Galla Anonima, zaliczana do grupy chartów.\"\n",`
			`"def split_sentences(text):\n",`
			`" sentences = re.split(r'([.!?]\\s+)(?=[A-Z])', text)\n",`
			`"\n",`
			`"\n",`
			`" full_sentences = [''.join(pair) for pair in zip(sentences[::2], sentences[1::2])]\n",`
			`"\n",`
			`"\n",`
			`" for sentence in full_sentences:\n",`
			`" print(sentence)\n",`
			`" print(full_sentences)\n",`
			`"split_sentences(tekst)"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"#### Niewłaściwa nazwa funkcji switch_letter (robi coś innego, niż nazwa na to wskazuje). Linijka z sum jest nieczytelna."`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 10,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"--- faja.\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"text = \"kurde faja.\"\n",`
			`"\n",`
			`"vulgar_words_base = [\"kurd\", \"choler\"]\n",`
			`"\n",`
			`"def switch_letter(word, vulgar_word_list):\n",`
			`" word = word.lower()\n",`
			`" for bad_word in vulgar_word_list:\n",`
			`" switched_letters = sum(1 for a, b in zip(word, bad_word) if a != b)\n",`
			`" if switched_letters == 1:\n",`
			`" return True\n",`
			`" return False\n",`
			`"\n",`
			`"def censor_text(text):\n",`
			`" pattern = re.compile(r'[^\\s](' + '\|'.join([f'{word}' for word in vulgar_words_base]) + r')[^\\s]', re.IGNORECASE)\n",`
			`" censored_text = pattern.sub(\"---\", text)\n",`
			`"\n",`
			`" censored_text_list = censored_text.split()\n",`
			`" \n",`
			`" for i, word in enumerate(censored_text_list):\n",`
			`" if switch_letter(word, vulgar_words_base):\n",`
			`" censored_text_list[i] = \"---\"\n",`
			`" final_censored_text = \" \".join(censored_text_list)\n",`
			`"\n",`
			`" return final_censored_text\n",`
			`"\n",`
			`"print(censor_text(text))"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 7,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"data": {`
			`"text/plain": [`
			`"False"`
			`]`
			`},`
			`"execution_count": 7,`
			`"metadata": {},`
			`"output_type": "execute_result"`
			`}`
			`],`
			`"source": [`
			`"switch_letter(\"kurcze\", [\"kurzce\"])"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"#### Jeżeli nie ma takiej konieczności nie iterujemy po rozdzielonym na słowa tekście, tylko na całym tekście."`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 4,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Siała baba mak.\n",`
			`"Czy wiedziała jak?\n",`
			`"Dziadek wiedział, nie powiedział, a to było tak!\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"# Solution 2\n",`
			`"text = 'Siała baba mak. Czy wiedziała jak? Dziadek wiedział, nie powiedział, a to było tak!'\n",`
			`"sentences = []\n",`
			`"\n",`
			`"def split_sentences(text):\n",`
			`" sentence = ''\n",`
			`" for word in text.split():\n",`
			`" x = re.search(r'[a-zA-Z0-9]+[.?!]', word)\n",`
			`" if x is None:\n",`
			`" sentence += f'{word} '\n",`
			`" else:\n",`
			`" sentence += word\n",`
			`" sentences.append(sentence)\n",`
			`" sentence = ''\n",`
			`" for result in sentences:\n",`
			`" print(result)\n",`
			`"\n",`
			`"\n",`
			`"split_sentences(text)"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"#### Nie stosujemy zapisu if {zmienna}, tylko if {zmienna} is True/False. Kod dla danego warunku przenosimy do nowej linii"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 5,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import re\n",`
			`"\n",`
			`"def validate_name(name):\n",`
			`" valid = re.match(r'^[A-Z][a-z]{1,}',name)\n",`
			`" if valid: return True\n",`
			`" else: return False\n"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"#### Przykład właściwego zastosowania komentarza"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 6,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def censor_text(text):\n",`
			`" prefixes = r'(do\|na\|o\|od\|pod\|po\|prze\|przy\|roz\|s\|u\|w\|y\|za\|z\|u)*'\n",`
			`"\n",`
			`" # profanities according to prof. Jerzy Bralczyk\n",`
			`" profanities = [ \n",`
			`" rf'\\b{prefixes}(pierd\\w*)\\b',\n",`
			`" ]\n",`
			`"\n",`
			`" profanity_pattern = re.compile('\|'.join(profanities), re.IGNORECASE)\n",`
			`"\n",`
			`" return profanity_pattern.sub('---', text)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
Add & update files 2024-11-13 13:27:23 +01:00			`"source": [`
			`"# jeżeli ten if określa 3 warianty na tym samym poziomie, to nie stosujemy zagnieżdżenia warunków\n",`
			`"if [ \"$positive_count\" -gt \"$negative_count\" ]; then\n",`
			`" echo \"wydzwiek pozytywny\"\n",`
			`"else\n",`
			`" if [ \"$negative_count\" -gt \"$positive_count\" ]; then\n",`
			`" echo \"wydzwiek: negatywny\"\n",`
			`" else\n",`
			`" echo \"wydzwiek: neutralny\"\n",`
			`" fi\n",`
			`"fi\n",`
			`"\n",`
			`"\n",`
			`"# ten else nigdy się nie wywoła - nie powinno go być\n",`
			`"if [ $positive_count -gt $negative_count ]\n",`
			`" then echo \"Positive\"\n",`
			`"elif [ $positive_count -lt $negative_count ]\n",`
			`" then echo \"Negative\"\n",`
			`"elif [ $positive_count -eq $negative_count ]\n",`
			`" then echo \"Neutral\"\n",`
			`"else\n",`
			`" echo \"Error\" # to nie istnieje\n",`
			`"fi\n",`
			`"\n",`
			`"\n",`
			`"# positive - taki błąd mocno rzuca się w oczy (mimo że program działa)\n",`
			`"POZITIVE=\"positive-words.txt\""`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"# Notebook 05"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# algorytm wzięty z pseudokodu z prezentacji profesora Jassema\n",`
			`"def maxmatch_text_split(text, vocabulary):\n",`
			`" if text == \"\":\n",`
			`" return []\n",`
			`" for i in range(len(text)-1, -1, -1):\n",`
			`" firstword = text[0:i+1] # nie piszemy [0:x] tylko [:x]\n",`
			`" reminder = text[i+1:]\n",`
			`" if firstword in vocabulary:\n",`
			`" return [firstword] + maxmatch_text_split(reminder, vocabulary)\n",`
			`" firstword = text[0]\n",`
			`" reminder = text[1]\n",`
			`" return [firstword] + maxmatch_text_split(reminder, vocabulary)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def create_bpe_tokenizer(text, max_vocab_length):\n",`
			`" nfoiwanfoiwa\n",`
			`" \n",`
			`" for x in range(10):\n",`
			`" nfwoiaf\n",`
			`" \n",`
			`" awfnoia\n",`
			`" if noiawniofa:\n",`
			`" iognioe\n",`
			`" else:\n",`
			`" nawoinoigagna\n",`
			`" fniaw..\n",`
			`"\n",`
			`" return 0\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import string\n",`
			`"from collections import Counter\n",`
			`"import re\n",`
			`"\n",`
			`"\n",`
			`"def create_bpe_tokenizer(text, max_vocab_length):\n",`
			`" text = (\"\".join(x for x in text if x not in string.punctuation)).lower()\n",`
			`" vocabulary = list(set([x for x in text]))\n",`
			`" while len(vocabulary)<max_vocab_length:\n",`
			`" text = re.findall(\"\|\".join(vocabulary), \"\".join(text))\n",`
			`" list_bigrams = []\n",`
			`" for i in range(0, len(text)-1):\n",`
			`" list_bigrams.append(\"\".join(text[i:i+2]))\n",`
			`" bi_freq = Counter(list_bigrams)\n",`
			`" if all(i == 1 for i in bi_freq.values()):\n",`
			`" break\n",`
			`" sorted_bigram_list = sorted(bi_freq.items(), key = lambda x: list_bigrams.index(x[0]))\n",`
			`" sorted_bigram_dict={}\n",`
			`" for key, value in sorted_bigram_list:\n",`
			`" sorted_bigram_dict[key] = value\n",`
			`" vocabulary.append(max(sorted_bigram_dict, key=sorted_bigram_dict.get))\n",`
			`" vocabulary = sorted(vocabulary, key = len, reverse=True)\n",`
			`" vocabulary = sorted(vocabulary, key = len, reverse=True)\n",`
			`" text = re.findall(\"\|\".join(vocabulary), \"\".join(text))\n",`
			`" # print( len(vocabulary), sorted(vocabulary, key = len))\n",`
			`" return vocabulary\n"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"#### Próba \"uratowania\" powyższego kodu"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def all_frequencies_are_ones(bigram_freqs):\n",`
			`" return all(i == 1 for i in bigram_freqs.values())\n",`
			`"\n",`
			`"\n",`
			`"def create_bpe_tokenizer2(text, max_vocab_length):\n",`
			`" text = (\"\".join(x for x in text if x not in string.punctuation)).lower()\n",`
			`" vocabulary = list(set(text))\n",`
			`"\n",`
			`" while len(vocabulary) < max_vocab_length:\n",`
			`" text = re.findall(\"\|\".join(vocabulary), \"\".join(text))\n",`
			`" bigrams = []\n",`
			`"\n",`
			`" for i in range(0, len(text)-1):\n",`
			`" bigrams.append(\"\".join(text[i:i+2]))\n",`
			`"\n",`
			`" bigram_freq = Counter(bigrams)\n",`
			`" if all_frequencies_are_ones(bigram_freq):\n",`
			`" break\n",`
			`"\n",`
			`" most_common_bigram = bigram_freq.most_common(1)[0][0]\n",`
			`" vocabulary.append(most_common_bigram)\n",`
			`" vocabulary = sorted(vocabulary, key = len, reverse=True)\n",`
			`" \n",`
			`" vocabulary = sorted(vocabulary, key = len, reverse=True)\n",`
			`"\n",`
			`" return vocabulary"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"#### Warto zapoznać się z obiektami z paczki collections"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"from collections import defaultdict\n",`
			`"\n",`
			`"pairs = {}\n",`
			`"for sequence in vocab:\n",`
			`" symbols = sequence.split()\n",`
			`" for i in range(len(symbols) - 1):\n",`
			`" pair = (symbols[i], symbols[i + 1])\n",`
			`" if pair in pairs:\n",`
			`" pairs[pair] += vocab[sequence]\n",`
			`" else:\n",`
			`" pairs[pair] = vocab[sequence]\n",`
			`"\n",`
			`"# to samo co\n",`
			`"pairs = defaultdict(int)\n",`
			`"for sequence in vocab:\n",`
			`" symbols = sequence.split()\n",`
			`" for i in range(len(symbols) - 1):\n",`
			`" pair = (symbols[i], symbols[i + 1])\n",`
			`" pairs[pair] += vocab[sequence]"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"#### Nie uzywamy dlugich slow na iteratory"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def maxmatch_text_split(text, vocabulary):\n",`
			`" words_list = []\n",`
			`" iterator = 0\n",`
			`" \n",`
			`" while iterator < len(text):\n",`
			`" \n",`
			`" for backwards_iterator in range(len(text), iterator, -1):\n",`
			`" # if text[iterator : backwards_iterator] in vocabulary: \n",`
			`" if text[iterator : backwards_iterator].lower() in vocabulary: #.lower() because every token is lower case in vocab\n",`
			`" words_list.append(text[iterator : backwards_iterator]) #.lower() if want to have exact same tokens as in vocab\n",`
			`" break\n",`
			`" elif backwards_iterator == iterator + 1:\n",`
			`" words_list.append(text[iterator : backwards_iterator])\n",`
			`" \n",`
			`" iterator += len(words_list[-1])\n",`
			`" return words_list"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"#### Niedopuszczalne są takie nazwy zmiennych (z błędami!)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"dictinary_of_pairs_occurance = {}"`
			`]`
Add files 2024-11-20 13:35:28 +01:00			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"#### Uproszczenie funkcji"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def has_vowel(word):\n",`
			`" check = False\n",`
			`" for i in range(0,len(word)):\n",`
			`" if is_vowel(word, i) == True:\n",`
			`" check = True\n",`
			`" break\n",`
			`" return check\n",`
			`"\n",`
			`"\n",`
			`"def has_vowel(word):\n",`
			`" for x in range(len(word)):\n",`
			`" if is_vowel(word, x) is True:\n",`
			`" return True\n",`
			`" \n",`
			`" return False"`
			`]`
Update files and add examples 2024-10-23 15:13:08 +02:00			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "base",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.11.5"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`