{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "full_pipeline.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "9c76638985c94701a2d86bb525efc606": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_67d9046a45cb462d91eb2dd14259a21b", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_75d3a999f54647d69bad407b015b3de1", "IPY_MODEL_68c7c73101124e039210b5077ac36d5b" ] } }, "67d9046a45cb462d91eb2dd14259a21b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "75d3a999f54647d69bad407b015b3de1": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_8e7cbdb93fb2490ea24f616db532648b", "_dom_classes": [], "description": "100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 20000, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 20000, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_ef225bfda78b40bd84dc81dd2b64a215" } }, "68c7c73101124e039210b5077ac36d5b": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_43a31b9b301c454db0a359ce13172883", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 20000/20000 [37:10<00:00, 8.97it/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_633e30a42efd458c9b6899552f5473d1" } }, "8e7cbdb93fb2490ea24f616db532648b": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "ef225bfda78b40bd84dc81dd2b64a215": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "43a31b9b301c454db0a359ce13172883": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "633e30a42efd458c9b6899552f5473d1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "1438b4d8e39948be9d41796cea1bf35e": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_fa644d0ae27b4f4985bb537115ffa5e4", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_7e966f59f546486595f8a18986a18b33", "IPY_MODEL_cd063db3bf0a48909dbc59ce83417556" ] } }, "fa644d0ae27b4f4985bb537115ffa5e4": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "7e966f59f546486595f8a18986a18b33": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_43a63a37e4504b08bcef619d49a0c283", "_dom_classes": [], "description": "100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 19986, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 19986, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_e102711dd88e409abbac9b3658469811" } }, "cd063db3bf0a48909dbc59ce83417556": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_97d72b38ec5c4381b1e125030132739c", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 19986/19986 [01:38<00:00, 202.18it/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_b603a738fa0249a2b45e5a8c725f7385" } }, "43a63a37e4504b08bcef619d49a0c283": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "e102711dd88e409abbac9b3658469811": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "97d72b38ec5c4381b1e125030132739c": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "b603a738fa0249a2b45e5a8c725f7385": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "90ea9fd32df14868b7286d00193e65f5": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_4f154dd9265f413ab629ed6683080abd", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_7f7f5bdd0e5f4201a13baf81681e51ec", "IPY_MODEL_9c49b3c0da8f4b13a7686525a180d873" ] } }, "4f154dd9265f413ab629ed6683080abd": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "7f7f5bdd0e5f4201a13baf81681e51ec": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_dadf258368454db4a3a5cb31d24d6217", "_dom_classes": [], "description": "100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 11628, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 11628, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_735dc0020f43400a8189f14549c3d259" } }, "9c49b3c0da8f4b13a7686525a180d873": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_361e57bbb4e34a57ad67dff4b0d50406", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 11628/11628 [00:58<00:00, 198.06it/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_6c39888179824232afe573d81aca2aca" } }, "dadf258368454db4a3a5cb31d24d6217": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "735dc0020f43400a8189f14549c3d259": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "361e57bbb4e34a57ad67dff4b0d50406": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "6c39888179824232afe573d81aca2aca": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "7fd3d8803eaf44dabe06fd7c8a1e3569": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_4b6ba75be39846b891d6cb04b9110734", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_72b4b1bb0ad14fc38a4e5a9d47ac4f27", "IPY_MODEL_47cdc9134a934e5997e0274e0aa51ed5" ] } }, "4b6ba75be39846b891d6cb04b9110734": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "72b4b1bb0ad14fc38a4e5a9d47ac4f27": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_cda7970d110c43d5b9626a917445c272", "_dom_classes": [], "description": "100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 14132, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 14132, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_95942678b94f44eb954b6e93295f54c1" } }, "47cdc9134a934e5997e0274e0aa51ed5": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_23d2f634959c4211b44bda5de65860e0", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 14132/14132 [01:05<00:00, 215.24it/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_2b7d89e4271f4722ae8aab078f55e21c" } }, "cda7970d110c43d5b9626a917445c272": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "95942678b94f44eb954b6e93295f54c1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "23d2f634959c4211b44bda5de65860e0": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "2b7d89e4271f4722ae8aab078f55e21c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } } } } }, "cells": [ { "cell_type": "code", "metadata": { "id": "J5Q-tp0U3pHl" }, "source": [ "import re\n", "import numpy as np\n", "from collections import defaultdict\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F" ], "execution_count": 2, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "9t4L-LbyOHNc" }, "source": [ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" ], "execution_count": 3, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "yVPj34v-718x" }, "source": [ "with open(\"stopwords.txt\", \"r+\") as f:\n", " stop_words = f.read().split(\"\\n\")" ], "execution_count": 7, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "7hDKry1k46ZJ" }, "source": [ "def clean_text(text):\n", " split = text.lower().split(\" \")\n", "\n", " # removing punctuation\n", " clean = []\n", " for token in split:\n", " token = re.sub(r'[^\\w\\s]', '', token)\n", " if token:\n", " clean.append(token)\n", " return clean" ], "execution_count": 8, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "EZSzAKY8ALMK" }, "source": [ "def prepare_corpus(texts, min_count=1, min_word_len=1):\n", " corpus = {}\n", " counters = defaultdict(lambda: 0)\n", " idx_counter = 0\n", " for text in texts:\n", "\n", " # add to corpus\n", " for token in text:\n", " if len(token) < min_word_len or token in stop_words:\n", " continue\n", " counters[token] += 1\n", " if token not in corpus and counters[token] == min_count:\n", " corpus[token] = idx_counter\n", " idx_counter += 1\n", " return corpus" ], "execution_count": 9, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "7Mpm7weQANWX" }, "source": [ "counters = defaultdict(lambda: 0)\n", "\n", "class WordCorpus:\n", " def __init__(self, corpus=None, texts=None, min_count=1, min_word_len=1):\n", " if corpus:\n", " self.corpus = corpus\n", " else:\n", " self.corpus = prepare_corpus(texts, min_count, min_word_len)\n", "\n", " def get_word_idx(self, token):\n", " token = token.lower()\n", " token = re.sub(r'[^\\w\\s]', '', token)\n", "\n", " return self.corpus.get(token, None)\n", "\n", " def get_embedding(self, token, encode=False):\n", " embedding = np.zeros(len(self.corpus), dtype=np.int32)\n", " if encode:\n", " token_idx = token\n", " else:\n", " token = token.lower()\n", " token = re.sub(r'[^\\w\\s]', '', token)\n", " if not token or token not in self.corpus:\n", " return embedding\n", "\n", " token_idx = self.corpus[token]\n", " embedding[token_idx] = 1\n", " return embedding\n", "\n", " def get_bow(self, text, encode=False):\n", " if encode:\n", " embeddings = [\n", " self.get_embedding(token, encode) for token in text\n", " ]\n", "\n", " return np.sum(embeddings, axis=0)\n", " else:\n", " bow = np.zeros(len(self.corpus), dtype=np.int32)\n", " for token in text:\n", " bow[token] += 1\n", " return bow" ], "execution_count": 10, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "IjOoR5qyAQSS" }, "source": [ "def load_train_data(train_path):\n", " texts = []\n", " with open(train_path, \"r+\") as file:\n", " while True:\n", " line = file.readline()\n", " if not line:\n", " break\n", "\n", " _, _, _, _, text, *_ = line.split(\"\\t\")\n", " texts.append(clean_text(text))\n", " print(f\"Loaded {len(texts)} texts from train_set.\")\n", " return texts" ], "execution_count": 11, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "oYgV8yPJGqq3" }, "source": [ "class LanguageNeuralModel(nn.Module):\n", " def __init__(self, corpus_size, hidden_size):\n", " super().__init__()\n", " self.input = nn.Linear(corpus_size, hidden_size)\n", " self.hidden = nn.Linear(hidden_size, hidden_size)\n", " self.output = nn.Linear(hidden_size, corpus_size)\n", "\n", " def forward(self, x):\n", " x = self.input(x)\n", " x = F.relu(x)\n", " x = self.hidden(x)\n", " x = F.relu(x)\n", "\n", " x = self.output(x)\n", " return x" ], "execution_count": 12, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "vAmveHcBGtrf" }, "source": [ "def get_random_word_with_contexts(text, context_size):\n", " allowed_indexes = np.arange(context_size, len(text) - context_size)\n", " if not len(allowed_indexes):\n", " return None, None\n", " word_idx = np.random.choice(allowed_indexes)\n", " word = text[word_idx]\n", " context = text[(word_idx - context_size):word_idx] + text[(word_idx + 1):(word_idx + 1 + context_size)]\n", " return word, context" ], "execution_count": 13, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6N72wcXIIPFu", "outputId": "01634aee-baa5-48cc-ecd3-420859bb1e76" }, "source": [ "a = clean_text(\"Ala ma kota , kot pije mleko\")\n", "get_random_word_with_contexts(a, 2)" ], "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "('kota', ['ala', 'ma', 'kot', 'pije'])" ] }, "metadata": { "tags": [] }, "execution_count": 14 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5cgusynnIQa7", "outputId": "8928fb1a-244a-4afa-d98b-8b2b1e790b77" }, "source": [ "train_texts = load_train_data(\"drive/MyDrive/train.tsv\")" ], "execution_count": 16, "outputs": [ { "output_type": "stream", "text": [ "Loaded 107471 texts from train_set.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "qvOgmYr1KM10" }, "source": [ "corpus = WordCorpus(texts=train_texts, min_count=20, min_word_len=5)" ], "execution_count": 17, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "u56IlKZuju9d", "outputId": "c3caf546-0cfe-4458-e710-a9d8b13f2b21" }, "source": [ "len(corpus.corpus)" ], "execution_count": 18, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "111418" ] }, "metadata": { "tags": [] }, "execution_count": 18 } ] }, { "cell_type": "code", "metadata": { "id": "0GO307zuHYLm" }, "source": [ "def remove_words_outside_corpus_and_encode(text, corpus):\n", " return [corpus.get_word_idx(token) for token in text if token in corpus.corpus]" ], "execution_count": 19, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "zFlfsgR3IQqX" }, "source": [ "train_texts = [remove_words_outside_corpus_and_encode(text, corpus) for text in train_texts]" ], "execution_count": 20, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "NTOCBZssKwci" }, "source": [ "BATCH_SIZE = 96\n", "CONTEXT_SIZE = 15" ], "execution_count": 21, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "AtuP4xv7LF26" }, "source": [ "import time\n", "\n", "def get_batch(texts):\n", " X, y = [], []\n", " size = len(texts)\n", " for _ in range(BATCH_SIZE):\n", " word_idx = None\n", " while word_idx is None:\n", " text_idx = np.random.randint(size)\n", " text = texts[text_idx]\n", " word_idx, context = get_random_word_with_contexts(text, CONTEXT_SIZE)\n", " bow = corpus.get_bow(context, encode=False)\n", " X.append(bow)\n", " y.append(word_idx)\n", " r = (np.array(X) / (CONTEXT_SIZE * 2)).astype(np.float32), np.array(y).astype(np.int64)\n", " return r" ], "execution_count": 22, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "0LrDHSC-MF2g" }, "source": [ "model = LanguageNeuralModel(len(corpus.corpus), 250)" ], "execution_count": 31, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "JhswE-B4MMBw" }, "source": [ "model = model.to(device)" ], "execution_count": 32, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6jsfrzQMOJHs", "outputId": "9b73e201-62b9-4904-e5d0-cd3f256c4bba" }, "source": [ "model.train()" ], "execution_count": 33, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "LanguageNeuralModel(\n", " (input): Linear(in_features=111418, out_features=250, bias=True)\n", " (hidden): Linear(in_features=250, out_features=250, bias=True)\n", " (output): Linear(in_features=250, out_features=111418, bias=True)\n", ")" ] }, "metadata": { "tags": [] }, "execution_count": 33 } ] }, { "cell_type": "code", "metadata": { "id": "wgVAoHOjOZEP" }, "source": [ "criterion = nn.CrossEntropyLoss().to(device)\n", "optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)" ], "execution_count": 34, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "32o0ZAtkOwzY" }, "source": [ "import tqdm" ], "execution_count": 35, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 817, "referenced_widgets": [ "9c76638985c94701a2d86bb525efc606", "67d9046a45cb462d91eb2dd14259a21b", "75d3a999f54647d69bad407b015b3de1", "68c7c73101124e039210b5077ac36d5b", "8e7cbdb93fb2490ea24f616db532648b", "ef225bfda78b40bd84dc81dd2b64a215", "43a31b9b301c454db0a359ce13172883", "633e30a42efd458c9b6899552f5473d1" ] }, "id": "0zYz4HDuO3mC", "outputId": "9b2f238c-8577-4e38-bcde-84251b3021c4" }, "source": [ "running_loss = 0.0\n", "\n", "for i in tqdm.tqdm_notebook(range(20000)):\n", " X, y = get_batch(train_texts)\n", " X, y = torch.from_numpy(X).to(device), torch.from_numpy(y).to(device)\n", "\n", " optimizer.zero_grad()\n", "\n", " outputs = model(X)\n", " loss = criterion(outputs, y)\n", "\n", " loss.backward()\n", " optimizer.step()\n", "\n", " running_loss += loss.item()\n", " if i % 500 == 499:\n", " torch.save(model.state_dict(), \"model.pth\")\n", " print('[%d, %5d] loss: %.3f' %\n", " (1, i + 1, running_loss / 500))\n", " running_loss = 0.0" ], "execution_count": 36, "outputs": [ { "output_type": "stream", "text": [ "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n", "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n", " This is separate from the ipykernel package so we can avoid doing imports until\n" ], "name": "stderr" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9c76638985c94701a2d86bb525efc606", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "[1, 500] loss: 10.873\n", "[1, 1000] loss: 10.559\n", "[1, 1500] loss: 10.505\n", "[1, 2000] loss: 10.437\n", "[1, 2500] loss: 10.371\n", "[1, 3000] loss: 10.371\n", "[1, 3500] loss: 10.336\n", "[1, 4000] loss: 10.338\n", "[1, 4500] loss: 10.325\n", "[1, 5000] loss: 10.325\n", "[1, 5500] loss: 10.335\n", "[1, 6000] loss: 10.366\n", "[1, 6500] loss: 10.366\n", "[1, 7000] loss: 10.377\n", "[1, 7500] loss: 10.392\n", "[1, 8000] loss: 10.422\n", "[1, 8500] loss: 10.477\n", "[1, 9000] loss: 10.525\n", "[1, 9500] loss: 10.562\n", "[1, 10000] loss: 10.593\n", "[1, 10500] loss: 10.657\n", "[1, 11000] loss: 10.711\n", "[1, 11500] loss: 10.706\n", "[1, 12000] loss: 10.781\n", "[1, 12500] loss: 10.799\n", "[1, 13000] loss: 10.875\n", "[1, 13500] loss: 10.882\n", "[1, 14000] loss: 10.921\n", "[1, 14500] loss: 10.946\n", "[1, 15000] loss: 10.979\n", "[1, 15500] loss: 11.001\n", "[1, 16000] loss: 11.032\n", "[1, 16500] loss: 11.069\n", "[1, 17000] loss: 11.090\n", "[1, 17500] loss: 11.112\n", "[1, 18000] loss: 11.119\n", "[1, 18500] loss: 11.132\n", "[1, 19000] loss: 11.212\n", "[1, 19500] loss: 11.188\n", "[1, 20000] loss: 11.213\n", "\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "Yoe-By2iQANV", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "db1aea22-e3d3-4023-b07c-31a1df139c7e" }, "source": [ "model.eval()" ], "execution_count": 37, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "LanguageNeuralModel(\n", " (input): Linear(in_features=111418, out_features=250, bias=True)\n", " (hidden): Linear(in_features=250, out_features=250, bias=True)\n", " (output): Linear(in_features=250, out_features=111418, bias=True)\n", ")" ] }, "metadata": { "tags": [] }, "execution_count": 37 } ] }, { "cell_type": "code", "metadata": { "id": "LX9xmKXwQdd7" }, "source": [ "sets_to_eval = [\"drive/MyDrive/dev0/\", \"drive/MyDrive/dev1/\", \"drive/MyDrive/test/\"]" ], "execution_count": 50, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "OmPYkEsHQ_QL" }, "source": [ "def load_test_data(test_path, corpus):\n", " texts = []\n", " with open(test_path, \"r+\") as file:\n", " while True:\n", " line = file.readline()\n", " if not line:\n", " break\n", "\n", " _, _, left, right, *_ = line.split(\"\\t\")\n", " texts.append(\n", " (\n", " remove_words_outside_corpus_and_encode(clean_text(left), corpus),\n", " remove_words_outside_corpus_and_encode(clean_text(right), corpus)\n", " )\n", " )\n", " print(f\"Loaded {len(texts)} texts from train_set.\")\n", " return texts" ], "execution_count": 39, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "6j2QUhPWSXyL", "colab": { "base_uri": "https://localhost:8080/", "height": 320, "referenced_widgets": [ "1438b4d8e39948be9d41796cea1bf35e", "fa644d0ae27b4f4985bb537115ffa5e4", "7e966f59f546486595f8a18986a18b33", "cd063db3bf0a48909dbc59ce83417556", "43a63a37e4504b08bcef619d49a0c283", "e102711dd88e409abbac9b3658469811", "97d72b38ec5c4381b1e125030132739c", "b603a738fa0249a2b45e5a8c725f7385", "90ea9fd32df14868b7286d00193e65f5", "4f154dd9265f413ab629ed6683080abd", "7f7f5bdd0e5f4201a13baf81681e51ec", "9c49b3c0da8f4b13a7686525a180d873", "dadf258368454db4a3a5cb31d24d6217", "735dc0020f43400a8189f14549c3d259", "361e57bbb4e34a57ad67dff4b0d50406", "6c39888179824232afe573d81aca2aca", "7fd3d8803eaf44dabe06fd7c8a1e3569", "4b6ba75be39846b891d6cb04b9110734", "72b4b1bb0ad14fc38a4e5a9d47ac4f27", "47cdc9134a934e5997e0274e0aa51ed5", "cda7970d110c43d5b9626a917445c272", "95942678b94f44eb954b6e93295f54c1", "23d2f634959c4211b44bda5de65860e0", "2b7d89e4271f4722ae8aab078f55e21c" ] }, "outputId": "3a93de12-bb84-43e9-e26b-a6c92d789a77" }, "source": [ "words = list(corpus.corpus)\n", "\n", "with torch.no_grad():\n", " for path in sets_to_eval:\n", " data = load_test_data(path + \"in.tsv\", corpus)\n", " results = []\n", " batch = []\n", " for left, right in tqdm.tqdm_notebook(data):\n", " if len(batch) < BATCH_SIZE:\n", " context = left[-CONTEXT_SIZE:] + right[:CONTEXT_SIZE]\n", " context = corpus.get_bow(context, encode=False)\n", " batch.append(context)\n", " continue\n", " batch = (np.array(batch) / (2*CONTEXT_SIZE)).astype(np.float32)\n", " X = torch.from_numpy(batch).to(device)\n", " out = F.softmax(model(X)).tolist()[0]\n", "\n", " indexes = list(range(len(corpus.corpus)))\n", " indexes = sorted(indexes, key=lambda x: out[x], reverse=True)\n", "\n", " res = \"\"\n", " prob0 = 1.\n", " for idx in indexes[:10000]:\n", " prob0 -= out[idx]\n", " res += f\"{words[idx]}:{np.log(out[idx])} \"\n", " res += f\":{np.log(prob0)}\"\n", " results.append(res)\n", " batch = []\n", " with open(path + \"out.tsv\", \"w+\") as f:\n", " f.write(\"\\n\".join(results))" ], "execution_count": 54, "outputs": [ { "output_type": "stream", "text": [ "Loaded 19986 texts from train_set.\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:8: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n", "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n", " \n" ], "name": "stderr" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1438b4d8e39948be9d41796cea1bf35e", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=19986.0), HTML(value='')))" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:16: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n", " app.launch_new_instance()\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "\n", "Loaded 11628 texts from train_set.\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "90ea9fd32df14868b7286d00193e65f5", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=11628.0), HTML(value='')))" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\n", "Loaded 14132 texts from train_set.\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7fd3d8803eaf44dabe06fd7c8a1e3569", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=14132.0), HTML(value='')))" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "ZKLc8SZLt171" }, "source": [ "" ], "execution_count": null, "outputs": [] } ] }