{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Copy of HowDoI-NLP.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "b9f3692aa5394af38fbf8701f9d1a2f5": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_20cc5addfd6a404abd07e3e7410604bf", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_193cdcb227f4436ab295a1d03c935bc7", "IPY_MODEL_e3736ea8f8f1433fa58a2c7cefbc9d43" ] } }, "20cc5addfd6a404abd07e3e7410604bf": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "193cdcb227f4436ab295a1d03c935bc7": { 
"model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_0214896e38b7475b9810990cd98bcfbe", "_dom_classes": [], "description": "Downloading: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 231508, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 231508, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_9d6b675edcf94469a3665d8ebd699cd6" } }, "e3736ea8f8f1433fa58a2c7cefbc9d43": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_2a6a7cec7ce9445a854b6378fe5c7d69", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 232k/232k [00:00<00:00, 885kB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_af203886f4db407f92d990d4d421c207" } }, "0214896e38b7475b9810990cd98bcfbe": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "9d6b675edcf94469a3665d8ebd699cd6": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, 
"flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "2a6a7cec7ce9445a854b6378fe5c7d69": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "af203886f4db407f92d990d4d421c207": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, 
"min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "89764229f6bb4419927cd6499b252cc1": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_489bcf4ae407408fa0b4f18193277f3d", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_c907be1df1ef42cda9d7a45ce3ae50ff", "IPY_MODEL_1f0711de1dde4e12b0ff8f7f11f38e5d" ] } }, "489bcf4ae407408fa0b4f18193277f3d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "c907be1df1ef42cda9d7a45ce3ae50ff": 
{ "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_420b462bf1dc498082e1c341e9649e34", "_dom_classes": [], "description": "Downloading: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 466062, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 466062, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_e47fef1e664a4a608cd595eb476b84d0" } }, "1f0711de1dde4e12b0ff8f7f11f38e5d": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_fc1006302b0741b18c5719d6b46b508b", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 466k/466k [00:00<00:00, 1.41MB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_20700e5b87dd4845b9d451a4e3a81794" } }, "420b462bf1dc498082e1c341e9649e34": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "e47fef1e664a4a608cd595eb476b84d0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, 
"flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "fc1006302b0741b18c5719d6b46b508b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "20700e5b87dd4845b9d451a4e3a81794": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, 
"min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "723f45572f9845f7a5a43d028acabd37": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_fd385e56155444c6ba0b70f461b014ef", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_d032b636ac73444ea8e13ec9ee1866d8", "IPY_MODEL_f6501a7bc274434eae206e1e1fb2e00a" ] } }, "fd385e56155444c6ba0b70f461b014ef": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "d032b636ac73444ea8e13ec9ee1866d8": 
{ "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_7b3318ea2d8e49e186f47c5792cbe30b", "_dom_classes": [], "description": "Downloading: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 28, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 28, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_e9a8d590fca842859822b4cad55b1a5f" } }, "f6501a7bc274434eae206e1e1fb2e00a": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_f9924cd4fc98461c82148ac3a38a845d", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 28.0/28.0 [00:00<00:00, 673B/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_fd9e3dfc77e243ecb7f6257b6700e728" } }, "7b3318ea2d8e49e186f47c5792cbe30b": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "e9a8d590fca842859822b4cad55b1a5f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": 
null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "f9924cd4fc98461c82148ac3a38a845d": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "fd9e3dfc77e243ecb7f6257b6700e728": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, 
"padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "7a54279586bb4c87a52746f605f7b3e2": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_6a7acabfc3f24d3395c0cc7cde68a43a", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_b9c6899ed3dd489d85b201b51a6f35c3", "IPY_MODEL_b413ba22912c4149aeea6824ccbc90c1" ] } }, "6a7acabfc3f24d3395c0cc7cde68a43a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "b9c6899ed3dd489d85b201b51a6f35c3": { "model_module": 
"@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_6151969c55174665964db6f97bd18884", "_dom_classes": [], "description": "Downloading: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 442, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 442, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_f97dbff45ea347f380b3d3e2bf9b60a5" } }, "b413ba22912c4149aeea6824ccbc90c1": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_6bc47beb071c4884a4ddb8a7dd669222", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 442/442 [00:00<00:00, 12.0kB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_6acde23812d44176829ce106ea626f09" } }, "6151969c55174665964db6f97bd18884": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "f97dbff45ea347f380b3d3e2bf9b60a5": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, 
"min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "6bc47beb071c4884a4ddb8a7dd669222": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "6acde23812d44176829ce106ea626f09": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, 
"grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "6e48979f9d49495da15e278aafd97a8b": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_73a609c59da744088c8def134a0acebc", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_3e978968f8814d4e9cb0cce52fe25f53", "IPY_MODEL_5b91aa38af194763b33168e24eb80b91" ] } }, "73a609c59da744088c8def134a0acebc": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "3e978968f8814d4e9cb0cce52fe25f53": { "model_module": 
"@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_27cc2164a06a40a0b6e093bd1f642fce", "_dom_classes": [], "description": "Downloading: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 267967963, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 267967963, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_1b013fd1c8ce45c992ed4dba0bcf2393" } }, "5b91aa38af194763b33168e24eb80b91": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_daedb950ac444ef3aa764b29f3e84052", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 268M/268M [00:05<00:00, 50.3MB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_579a1b0eff7b425a9f22a18506fb7033" } }, "27cc2164a06a40a0b6e093bd1f642fce": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "1b013fd1c8ce45c992ed4dba0bcf2393": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, 
"width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "daedb950ac444ef3aa764b29f3e84052": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "579a1b0eff7b425a9f22a18506fb7033": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, 
"padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } } } }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "co5T4uL_pXfO" }, "source": [ "## Fetch data from disk" ] }, { "cell_type": "code", "metadata": { "id": "Pjc-a5jWpdzy", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "192951ce-5610-4f4b-b7eb-ef9fb6c9ece7" }, "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Mounted at /content/drive\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "VfZhymMbzuSg", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "41cf4baf-003f-4b50-9e65-5be0abce2a5c" }, "source": [ "!pip install transformers" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Collecting transformers\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)\n", "\u001b[K |████████████████████████████████| 2.2MB 9.4MB/s \n", "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.41.1)\n", "Collecting sacremoses\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)\n", "\u001b[K |████████████████████████████████| 870kB 41.6MB/s \n", "\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl 
(3.3MB)\n", "\u001b[K |████████████████████████████████| 3.3MB 42.0MB/s \n", "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers) (20.9)\n", "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from transformers) (3.8.1)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.0.12)\n", "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n", "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2020.12.5)\n", "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers) (2.4.7)\n", "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from 
importlib-metadata; python_version < \"3.8\"->transformers) (3.4.1)\n", "Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers) (3.7.4.3)\n", "Building wheels for collected packages: sacremoses\n", " Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=8188b4a1fe7b93d2ceab6b8c4c244e549efa60bdfd85aed0aa3020e99990acc0\n", " Stored in directory: /root/.cache/pip/wheels/3e/fb/c0/13ab4d63d537658f448366744654323077c4d90069b6512f3c\n", "Successfully built sacremoses\n", "Installing collected packages: sacremoses, tokenizers, transformers\n", "Successfully installed sacremoses-0.0.44 tokenizers-0.10.2 transformers-4.5.0\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "lLppqXoRSjeO" }, "source": [ "## Data Prep" ] }, { "cell_type": "code", "metadata": { "id": "VbcL94ugTLQ7" }, "source": [ "import pandas as pd\n", "from torch.utils.data import Dataset, IterableDataset, DataLoader, get_worker_info\n", "from transformers import DistilBertTokenizerFast\n", "import math" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 210, "referenced_widgets": [ "b9f3692aa5394af38fbf8701f9d1a2f5", "20cc5addfd6a404abd07e3e7410604bf", "193cdcb227f4436ab295a1d03c935bc7", "e3736ea8f8f1433fa58a2c7cefbc9d43", "0214896e38b7475b9810990cd98bcfbe", "9d6b675edcf94469a3665d8ebd699cd6", "2a6a7cec7ce9445a854b6378fe5c7d69", "af203886f4db407f92d990d4d421c207", "89764229f6bb4419927cd6499b252cc1", "489bcf4ae407408fa0b4f18193277f3d", "c907be1df1ef42cda9d7a45ce3ae50ff", "1f0711de1dde4e12b0ff8f7f11f38e5d", "420b462bf1dc498082e1c341e9649e34", "e47fef1e664a4a608cd595eb476b84d0", "fc1006302b0741b18c5719d6b46b508b", 
"20700e5b87dd4845b9d451a4e3a81794", "723f45572f9845f7a5a43d028acabd37", "fd385e56155444c6ba0b70f461b014ef", "d032b636ac73444ea8e13ec9ee1866d8", "f6501a7bc274434eae206e1e1fb2e00a", "7b3318ea2d8e49e186f47c5792cbe30b", "e9a8d590fca842859822b4cad55b1a5f", "f9924cd4fc98461c82148ac3a38a845d", "fd9e3dfc77e243ecb7f6257b6700e728" ] }, "id": "nplakpGpWubB", "outputId": "e3a3a5eb-d8d7-48e3-d50b-55ae1c801a33" }, "source": [ "ALL_LANGS = [\"A# .NET\",\"A# (Axiom)\",\"A-0 System\",\"A+\",\"A++\",\"ABAP\",\"ABC\",\"ABC ALGOL\",\"ABLE\",\"ABSET\",\"ABSYS\",\"ACC\",\"Accent\",\"Ace DASL\",\"ACL2\",\"ACT-III\",\"Action!\",\"ActionScript\",\"Ada\",\"Adenine\",\"Agda\",\"Agilent VEE\",\"Agora\",\"AIMMS\",\"Alef\",\"ALF\",\"ALGOL 58\",\"ALGOL 60\",\"ALGOL 68\",\"ALGOL W\",\"Alice\",\"Alma-0\",\"AmbientTalk\",\"Amiga E\",\"AMOS\",\"AMPL\",\"APL\",\"App Inventor for Android's visual block language\",\"AppleScript\",\"Arc\",\"ARexx\",\"Argus\",\"AspectJ\",\"Assembly language\",\"ATS\",\"Ateji PX\",\"AutoHotkey\",\"Autocoder\",\"AutoIt\",\"AutoLISP / Visual LISP\",\"Averest\",\"AWK\",\"Axum\",\"B\",\"Babbage\",\"Bash\",\"BASIC\",\"bc\",\"BCPL\",\"BeanShell\",\"Batch (Windows/Dos)\",\"Bertrand\",\"BETA\",\"Bigwig\",\"Bistro\",\"BitC\",\"BLISS\",\"Blue\",\"Bon\",\"Boo\",\"Boomerang\",\"Bourne shell\",\"bash\",\"ksh\",\"BREW\",\"BPEL\",\"C\",\"C--\",\"C++\",\"C#\",\"C/AL\",\"Caché ObjectScript\",\"C Shell\",\"Caml\",\"Candle\",\"Cayenne\",\"CDuce\",\"Cecil\",\"Cel\",\"Cesil\",\"Ceylon\",\"CFEngine\",\"CFML\",\"Cg\",\"Ch\",\"Chapel\",\"CHAIN\",\"Charity\",\"Charm\",\"Chef\",\"CHILL\",\"CHIP-8\",\"chomski\",\"ChucK\",\"CICS\",\"Cilk\",\"CL\",\"Claire\",\"Clarion\",\"Clean\",\"Clipper\",\"CLIST\",\"Clojure\",\"CLU\",\"CMS-2\",\"COBOL\",\"Cobra\",\"CODE\",\"CoffeeScript\",\"Cola\",\"ColdC\",\"ColdFusion\",\"COMAL\",\"Combined Programming Language\",\"COMIT\",\"Common Intermediate Language\",\"Common Lisp\",\"COMPASS\",\"Component Pascal\",\"Constraint Handling 
Rules\",\"Converge\",\"Cool\",\"Coq\",\"Coral 66\",\"Corn\",\"CorVision\",\"COWSEL\",\"CPL\",\"csh\",\"CSP\",\"Csound\",\"CUDA\",\"Curl\",\"Curry\",\"Cyclone\",\"Cython\",\"D\",\"DASL\",\"DASL\",\"Dart\",\"DataFlex\",\"Datalog\",\"DATATRIEVE\",\"dBase\",\"dc\",\"DCL\",\"Deesel\",\"Delphi\",\"DinkC\",\"DIBOL\",\"Dog\",\"Draco\",\"DRAKON\",\"Dylan\",\"DYNAMO\",\"E\",\"E#\",\"Ease\",\"Easy PL/I\",\"Easy Programming Language\",\"EASYTRIEVE PLUS\",\"ECMAScript\",\"Edinburgh IMP\",\"EGL\",\"Eiffel\",\"ELAN\",\"Elixir\",\"Elm\",\"Emacs Lisp\",\"Emerald\",\"Epigram\",\"EPL\",\"Erlang\",\"es\",\"Escapade\",\"Escher\",\"ESPOL\",\"Esterel\",\"Etoys\",\"Euclid\",\"Euler\",\"Euphoria\",\"EusLisp Robot Programming Language\",\"CMS EXEC\",\"EXEC 2\",\"Executable UML\",\"F\",\"F#\",\"Factor\",\"Falcon\",\"Fancy\",\"Fantom\",\"FAUST\",\"Felix\",\"Ferite\",\"FFP\",\"Fjölnir\",\"FL\",\"Flavors\",\"Flex\",\"FLOW-MATIC\",\"FOCAL\",\"FOCUS\",\"FOIL\",\"FORMAC\",\"@Formula\",\"Forth\",\"Fortran\",\"Fortress\",\"FoxBase\",\"FoxPro\",\"FP\",\"FPr\",\"Franz Lisp\",\"Frege\",\"F-Script\",\"FSProg\",\"G\",\"Google Apps Script\",\"Game Maker Language\",\"GameMonkey Script\",\"GAMS\",\"GAP\",\"G-code\",\"Genie\",\"GDL\",\"Gibiane\",\"GJ\",\"GEORGE\",\"GLSL\",\"GNU E\",\"GM\",\"Go\",\"Go!\",\"GOAL\",\"Gödel\",\"Godiva\",\"GOM (Good Old Mad)\",\"Goo\",\"Gosu\",\"GOTRAN\",\"GPSS\",\"GraphTalk\",\"GRASS\",\"Groovy\",\"Hack (programming language)\",\"HAL/S\",\"Hamilton C shell\",\"Harbour\",\"Hartmann pipelines\",\"Haskell\",\"Haxe\",\"High Level Assembly\",\"HLSL\",\"Hop\",\"Hope\",\"Hugo\",\"Hume\",\"HyperTalk\",\"IBM Basic assembly language\",\"IBM HAScript\",\"IBM Informix-4GL\",\"IBM RPG\",\"ICI\",\"Icon\",\"Id\",\"IDL\",\"Idris\",\"IMP\",\"Inform\",\"Io\",\"Ioke\",\"IPL\",\"IPTSCRAE\",\"ISLISP\",\"ISPF\",\"ISWIM\",\"J\",\"J#\",\"J++\",\"JADE\",\"Jako\",\"JAL\",\"Janus\",\"JASS\",\"Java\",\"JavaScript\",\"JCL\",\"JEAN\",\"Join Java\",\"JOSS\",\"Joule\",\"JOVIAL\",\"Joy\",\"JScript\",\"JScript 
.NET\",\"JavaFX Script\",\"Julia\",\"Jython\",\"K\",\"Kaleidoscope\",\"Karel\",\"Karel++\",\"KEE\",\"Kixtart\",\"KIF\",\"Kojo\",\"Kotlin\",\"KRC\",\"KRL\",\"KUKA\",\"KRYPTON\",\"ksh\",\"L\",\"L# .NET\",\"LabVIEW\",\"Ladder\",\"Lagoona\",\"LANSA\",\"Lasso\",\"LaTeX\",\"Lava\",\"LC-3\",\"Leda\",\"Legoscript\",\"LIL\",\"LilyPond\",\"Limbo\",\"Limnor\",\"LINC\",\"Lingo\",\"Linoleum\",\"LIS\",\"LISA\",\"Lisaac\",\"Lisp\",\"Lite-C\",\"Lithe\",\"Little b\",\"Logo\",\"Logtalk\",\"LPC\",\"LSE\",\"LSL\",\"LiveCode\",\"LiveScript\",\"Lua\",\"Lucid\",\"Lustre\",\"LYaPAS\",\"Lynx\",\"M2001\",\"M4\",\"Machine code\",\"MAD\",\"MAD/I\",\"Magik\",\"Magma\",\"make\",\"Maple\",\"MAPPER\",\"MARK-IV\",\"Mary\",\"MASM Microsoft Assembly x86\",\"Mathematica\",\"MATLAB\",\"Maxima\",\"Macsyma\",\"Max\",\"MaxScript\",\"Maya (MEL)\",\"MDL\",\"Mercury\",\"Mesa\",\"Metacard\",\"Metafont\",\"MetaL\",\"Microcode\",\"MicroScript\",\"MIIS\",\"MillScript\",\"MIMIC\",\"Mirah\",\"Miranda\",\"MIVA Script\",\"ML\",\"Moby\",\"Model 204\",\"Modelica\",\"Modula\",\"Modula-2\",\"Modula-3\",\"Mohol\",\"MOO\",\"Mortran\",\"Mouse\",\"MPD\",\"CIL\",\"MSL\",\"MUMPS\",\"NASM\",\"NATURAL\",\"Napier88\",\"Neko\",\"Nemerle\",\"nesC\",\"NESL\",\"Net.Data\",\"NetLogo\",\"NetRexx\",\"NewLISP\",\"NEWP\",\"Newspeak\",\"NewtonScript\",\"NGL\",\"Nial\",\"Nice\",\"Nickle\",\"Nim\",\"NPL\",\"Not eXactly C\",\"Not Quite C\",\"NSIS\",\"Nu\",\"NWScript\",\"NXT-G\",\"o:XML\",\"Oak\",\"Oberon\",\"Obix\",\"OBJ2\",\"Object Lisp\",\"ObjectLOGO\",\"Object REXX\",\"Object Pascal\",\"Objective-C\",\"Objective-J\",\"Obliq\",\"Obol\",\"OCaml\",\"occam\",\"occam-π\",\"Octave\",\"OmniMark\",\"Onyx\",\"Opa\",\"Opal\",\"OpenCL\",\"OpenEdge ABL\",\"OPL\",\"OPS5\",\"OptimJ\",\"Orc\",\"ORCA/Modula-2\",\"Oriel\",\"Orwell\",\"Oxygene\",\"Oz\",\"P#\",\"ParaSail (programming 
language)\",\"PARI/GP\",\"Pascal\",\"Pawn\",\"PCASTL\",\"PCF\",\"PEARL\",\"PeopleCode\",\"Perl\",\"PDL\",\"PHP\",\"Phrogram\",\"Pico\",\"Picolisp\",\"Pict\",\"Pike\",\"PIKT\",\"PILOT\",\"Pipelines\",\"Pizza\",\"PL-11\",\"PL/0\",\"PL/B\",\"PL/C\",\"PL/I\",\"PL/M\",\"PL/P\",\"PL/SQL\",\"PL360\",\"PLANC\",\"Plankalkül\",\"Planner\",\"PLEX\",\"PLEXIL\",\"Plus\",\"POP-11\",\"PostScript\",\"PortablE\",\"Powerhouse\",\"PowerBuilder\",\"PowerShell\",\"PPL\",\"Processing\",\"Processing.js\",\"Prograph\",\"PROIV\",\"Prolog\",\"PROMAL\",\"Promela\",\"PROSE modeling language\",\"PROTEL\",\"ProvideX\",\"Pro*C\",\"Pure\",\"Python\",\"Q (equational programming language)\",\"Q (programming language from Kx Systems)\",\"Qalb\",\"QtScript\",\"QuakeC\",\"QPL\",\"R\",\"R++\",\"Racket\",\"RAPID\",\"Rapira\",\"Ratfiv\",\"Ratfor\",\"rc\",\"REBOL\",\"Red\",\"Redcode\",\"REFAL\",\"Reia\",\"Revolution\",\"rex\",\"REXX\",\"Rlab\",\"RobotC\",\"ROOP\",\"RPG\",\"RPL\",\"RSL\",\"RTL/2\",\"Ruby\",\"RuneScript\",\"Rust\",\"S\",\"S2\",\"S3\",\"S-Lang\",\"S-PLUS\",\"SA-C\",\"SabreTalk\",\"SAIL\",\"SALSA\",\"SAM76\",\"SAS\",\"SASL\",\"Sather\",\"Sawzall\",\"SBL\",\"Scala\",\"Scheme\",\"Scilab\",\"Scratch\",\"Script.NET\",\"Sed\",\"Seed7\",\"Self\",\"SenseTalk\",\"SequenceL\",\"SETL\",\"Shift Script\",\"SIMPOL\",\"SIGNAL\",\"SiMPLE\",\"SIMSCRIPT\",\"Simula\",\"Simulink\",\"SISAL\",\"SLIP\",\"SMALL\",\"Smalltalk\",\"Small Basic\",\"SML\",\"Snap!\",\"SNOBOL\",\"SPITBOL\",\"Snowball\",\"SOL\",\"Span\",\"SPARK\",\"Speedcode\",\"SPIN\",\"SP/k\",\"SPS\",\"Squeak\",\"Squirrel\",\"SR\",\"S/SL\",\"Stackless Python\",\"Starlogo\",\"Strand\",\"Stata\",\"Stateflow\",\"Subtext\",\"SuperCollider\",\"SuperTalk\",\"Swift (Apple programming language)\",\"Swift (parallel scripting 
language)\",\"SYMPL\",\"SyncCharts\",\"SystemVerilog\",\"T\",\"TACL\",\"TACPOL\",\"TADS\",\"TAL\",\"Tcl\",\"Tea\",\"TECO\",\"TELCOMP\",\"TeX\",\"TEX\",\"TIE\",\"Timber\",\"TMG\",\"Tom\",\"TOM\",\"Topspeed\",\"TPU\",\"Trac\",\"TTM\",\"T-SQL\",\"TTCN\",\"Turing\",\"TUTOR\",\"TXL\",\"TypeScript\",\"Turbo C++\",\"Ubercode\",\"UCSD Pascal\",\"Umple\",\"Unicon\",\"Uniface\",\"UNITY\",\"Unix shell\",\"UnrealScript\",\"Vala\",\"VBA\",\"VBScript\",\"Verilog\",\"VHDL\",\"Visual Basic\",\"Visual Basic .NET\",\"Visual DataFlex\",\"Visual DialogScript\",\"Visual Fortran\",\"Visual FoxPro\",\"Visual J++\",\"Visual J#\",\"Visual Objects\",\"Visual Prolog\",\"VSXu\",\"Vvvv\",\"WATFIV, WATFOR\",\"WebDNA\",\"WebQL\",\"Windows PowerShell\",\"Winbatch\",\"Wolfram\",\"Wyvern\",\"X++\",\"X#\",\"X10\",\"XBL\",\"XC\",\"XMOS architecture\",\"xHarbour\",\"XL\",\"Xojo\",\"XOTcl\",\"XPL\",\"XPL0\",\"XQuery\",\"XSB\",\"XSLT\",\"XPath\",\"Xtend\",\"Yorick\",\"YQL\",\"Z notation\",\"Zeno\",\"ZOPL\",\"ZPL\"]\n", "ALL_LANGS = list(map(lambda x: x.lower(), ALL_LANGS))\n", "ALL_LANGS_SET = set(ALL_LANGS)\n", "print(ALL_LANGS_SET)\n", "\n", "tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "{'k', 'gamemonkey script', 'prose modeling language', 'lua', 'x++', 'txl', 'joule', 'xotcl', 'opal', 'lite-c', 'algol 58', 'uniface', 'spin', 'icon', 'promela', 'g', 'chapel', 'nickle', 'nesc', 'boomerang', 'xl', 'l', 'averest', 'b', 'dataflex', 'datatrieve', 'smalltalk', 'mary', 'foxpro', 'prograph', 'moby', 'stackless python', 'assembly language', 'c/al', 'amiga e', 'caml', 'beanshell', 'rpg', 'ada', 'postscript', 'f#', 'snap!', 'iptscrae', 'dbase', 'ferite', 'ceylon', 'visual foxpro', 'redcode', 'quakec', 'mortran', 'kif', 'xpath', 'vbscript', 'goo', 'ace dasl', 'cg', 'lynx', 'sbl', 'yql', 'a+', 'csound', 'dibol', 'abc algol', 'elan', 'q (equational programming language)', 'spitbol', 'lansa', 'rapid', 
'csp', 'octave', 'george', 'cil', 'bon', 'cfml', 'pipelines', 'karel++', 'charity', 'argus', 'arc', 'alice', 'a++', 'clu', 'elixir', 'simscript', 'lilypond', 'starlogo', 'pizza', 'pl/0', 'self', 'clojure', 'reia', 'masm microsoft assembly x86', 'maxscript', 'latex', 'ppl', 'tmg', 'fp', 'formac', 'squirrel', 'mirah', 'typescript', 'miis', 'able', 'turing', 'synccharts', 'salsa', 'cyclone', 'squeak', 'occam-π', 'turbo c++', 'teco', 't-sql', 'ratfiv', 'not exactly c', 'graphtalk', 'xquery', 'planc', 'powerhouse', 'ibm hascript', 'logo', 'e#', 'bliss', 'ttm', 'bash', 'krl', 'tads', 'c#', 'java', 'ubercode', 'simula', 'simulink', 'ops5', 'grass', 'sml', 'adenine', 'code', 'mapper', 'sp/k', 'alf', 'kaleidoscope', 'robotc', 'macsyma', 'bc', 'joss', 'portable', 'inform', 'lingo', 'pcastl', 'cool', 'drakon', 'high level assembly', 'vsxu', 'rebol', 'objective-j', 'zeno', 'algol 60', 'sed', 'ipl', 'o:xml', 'nxt-g', 'cms-2', 'phrogram', 'jade', 'proiv', 'ici', 'julia', 'leda', 'processing', 'autolisp / visual lisp', 'systemverilog', 'pure', 'swift (apple programming language)', 'fancy', 'strand', 'j#', 'orwell', 'pl/m', 'lithe', 'chill', 'tacpol', 'coldfusion', 'logtalk', 'modelica', 'prolog', 'scratch', 'algol 68', 'jako', 'not quite c', 'mouse', 'watfiv, watfor', 'mdl', 'shift script', 'j', 'jscript .net', 'vvvv', 'mohol', 'clipper', 'little b', 'modula', 'mimic', 'f', 'gödel', 'cobol', 'unity', 'nwscript', 'ratfor', 'supercollider', 'red', 'bitc', 'clarion', 'kotlin', 'arexx', 'pari/gp', 'obol', 'a-0 system', 'millscript', 'abc', 'newspeak', 'dcl', 'oriel', 'act-iii', 'godiva', 'limbo', 'joy', 'mathematica', 'delphi', 'idl', 'autohotkey', 'parasail (programming language)', 'magik', 'nim', 'ocaml', 'easy programming language', 'janus', 'goal', 'modula-3', 'coldc', 'gnu e', 'lse', 'oberon', 'genie', 'esterel', 'gotran', 'qpl', 'ateji px', 'hlsl', 'small basic', 'r', 'focal', 'cola', 'euphoria', 'acl2', 'brew', 'linc', 'orc', 'bourne shell', 'winbatch', 'coffeescript', 
'stata', 'tal', 'mad/i', 'snowball', 'object rexx', 'visual prolog', 'pl/i', 'cics', 'cobra', 'haskell', 'visual j++', 'comit', 'cuda', 'snobol', 'lyapas', 'planner', 'curry', 'bigwig', 'javafx script', 'pop-11', 'visual dataflex', 'agora', 'egl', 'falcon', 'go', 'topspeed', 'hope', 'visual basic .net', 'hypertalk', 'accent', 'dynamo', 'ffp', 'natural', 'factor', 'ml', 'object lisp', 'tacl', 'caché objectscript', 'p#', 'sasl', 'tcl', 'mercury', 'fl', 'dog', 'simpol', 'amos', 'jass', 'epl', 'fantom', 'seed7', 'gpss', 'unicon', 'apl', 'pearl', 'abset', 'scala', 'neko', 'ladder', 'emacs lisp', 'stateflow', 'compass', 's3', 'aimms', 'lava', 'ucsd pascal', 'axum', 'visual objects', 'autoit', 'chain', 'a# .net', 'cel', 'setl', 'pl/p', 'candle', 'hume', 'awk', 'x10', 'nsis', 'combined programming language', 'max', 'mark-iv', 'gosu', 'jscript', 'sabretalk', 'io', 'sawzall', 'z notation', 'absys', 'easytrieve plus', 'visual basic', 'iswim', 'rc', 'scheme', 'orca/modula-2', 'timber', 'chip-8', 'python', 'clist', 'trac', 'sensetalk', 'revolution', 'sas', 'ngl', 'hartmann pipelines', 'simple', 'oxygene', 'obliq', 'euslisp robot programming language', 'kee', 'sail', 'coq', 'nial', 'sps', 'agilent vee', 'lc-3', 'pcf', 'bistro', 'ease', 'roop', 'lis', 'erlang', 'optimj', 'pawn', 's', 'join java', 's2', 'c', 'haxe', 'fortran', 'cfengine', 'boo', 'escher', 'executable uml', 'deesel', 'metafont', 'xsb', 'powershell', 'swift (parallel scripting language)', 'tutor', 'cecil', 'zpl', 's-plus', 'onyx', 'm2001', 'miranda', 'fjölnir', 'matlab', 'fsprog', 'objectlogo', 'e', 'pascal', 'pl/sql', 'plexil', 'moo', 'maple', 'yorick', 'abap', 'miva script', 'autocoder', 'hal/s', 'j++', 'imp', 'c++', 'ibm rpg', 'speedcode', 'tpu', 'visual j#', 'modula-2', 'xbl', 'pdl', 'franz lisp', 'gj', 'groovy', 'napier88', 'tex', 'flow-matic', 'xmos architecture', 'claire', 'php', 'lpc', 'lucid', 'gams', 'gom (good old mad)', 'g-code', 'msl', 'cowsel', 'ats', 'xharbour', 'foxbase', 'forth', 'oak', 'ruby', 
'plankalkül', 'ampl', 'cms exec', 'labview', 'model 204', 'a# (axiom)', 'hamilton c shell', 'sequencel', 'peoplecode', 'ibm basic assembly language', 'c shell', 'sam76', 'unix shell', 'webdna', 'algol w', 'visual dialogscript', 'mpd', 'beta', 'nice', 'signal', 'dasl', 'tie', 'legoscript', 'limnor', 'ttcn', 's-lang', 'nesl', 'bpel', 'common intermediate language', 'flavors', 'edinburgh imp', 'csh', 'euler', 'm4', 'pico', 'vhdl', 'protel', 'sa-c', 'lustre', 'mad', 'obj2', 'common lisp', 'newtonscript', 'krc', 'etoys', 'opl', 'powerbuilder', 'racket', 'windows powershell', 'converge', 'nasm', 'xc', 'f-script', 'livecode', 'espol', 'sol', 'script.net', 'sisal', 'batch (windows/dos)', 'microcode', 'rex', 'refal', 'rlab', 'lagoona', 'lisaac', 'picolisp', 'providex', 'r++', 'machine code', 'cesil', 'vala', 'es', 'obix', 'xpl0', 'alef', 'cduce', 'newp', 'lisp', 'cayenne', 'constraint handling rules', 'sr', 'subtext', 'cilk', 'dc', 'epigram', 'jal', 'kixtart', 'cpl', 'opa', 'felix', '@formula', 'hugo', 'plex', 'gibiane', 'maya (mel)', 'zopl', 'jython', 's/sl', 'jean', 'qtscript', 'rapira', 'supertalk', 'nu', 'ioke', 'gm', 'clean', 'microscript', 'promal', 'harbour', 'webql', 'scilab', 'rsl', 'xojo', 'applescript', 'foil', 'jovial', 'corvision', 'oz', 'coral 66', 'eiffel', 'maxima', 'dylan', 'pl/b', 'component pascal', 'npl', 'draco', 'opencl', 'perl', 'krypton', 'pl-11', 'xpl', 'netrexx', 'game maker language', 'umple', 'kojo', 'lsl', 'faust', 'gdl', 'object pascal', 'idris', 'unrealscript', 'action!', \"app inventor for android's visual block language\", 'focus', 'rexx', 'glsl', 'metal', 'sather', 'newlisp', 'islisp', 'flex', 'ambienttalk', 'pro*c', 'kuka', 'charm', 'openedge abl', 'xslt', 'occam', 'telcomp', 'dinkc', 'dart', 'bertrand', 'ksh', 'c--', 'lasso', 'slip', 'ibm informix-4gl', 'chuck', 'd', 'go!', 'fortress', 'l# .net', 'comal', 'q (programming language from kx systems)', 'xtend', 't', 'livescript', 'pict', 'ispf', 'vba', 'alma-0', 'pike', 'curl', 'small', 
'rust', 'mesa', 'rtl/2', 'hop', 'javascript', 'lil', 'exec 2', 'pikt', 'frege', 'qalb', 'bcpl', 'chomski', 'aspectj', 'rpl', 'fpr', 'plus', 'gap', 'tea', 'wyvern', 'cl', 'babbage', 'x#', 'ch', 'agda', 'cython', 'pl/c', 'elm', 'basic', 'nemerle', 'lisa', 'netlogo', 'euclid', 'pilot', 'tom', 'actionscript', 'processing.js', 'corn', 'metacard', 'sympl', 'magma', 'visual fortran', 'make', 'span', 'ecmascript', 'jcl', 'emerald', 'google apps script', 'spark', 'net.data', 'objective-c', 'blue', 'omnimark', 'mumps', 'runescript', 'verilog', 'karel', 'id', 'escapade', 'linoleum', 'easy pl/i', 'wolfram', 'datalog', 'chef', 'acc', 'pl360', 'hack (programming language)'}\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b9f3692aa5394af38fbf8701f9d1a2f5", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "89764229f6bb4419927cd6499b252cc1", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "723f45572f9845f7a5a43d028acabd37", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "gl4s2lbNNo_f" }, "source": [ 
"!cp ./drive/MyDrive/howdoi_train.csv ./\n", "!cp ./drive/MyDrive/howdoi_test.csv ./" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "bPeuuANP_Kj_", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "93c5bd8c-45ed-47ad-fea6-15e6637dbb2b" }, "source": [ "# do lazy loading with h5py to save memory\n", "'''\n", "import h5py\n", "import numpy as np\n", "\n", "import subprocess\n", "train_path, test_path = \"./howdoi_train.csv\", \"./howdoi_test.csv\"\n", "h5_train_path, h5_test_path = \"./data_tr.h5\", \"./data_ts.h5\"\n", "\n", "# this is just a random large number, this size of data (short strings)\n", "# doesn't take much RAM, not even sure we have to read it in chunks at all\n", "chunksize = 1000 * 10000\n", "\n", "# hacky way of reading the length of the file without opening it\n", "lines_train = subprocess.check_output(['wc', '-l', train_path])\n", "lines_train = int(lines_train.split()[0])\n", "\n", "# h5 is a format you can read from without loading up the data in memory\n", "# so it's perfect for huge datasets\n", "\n", "# NOTE: this will take a minute or so\n", "with h5py.File(h5_train_path, 'w') as h5f:\n", " # use num_features if the csv file has no column header\n", " texts = h5f.create_dataset(\"text-train\",\n", " shape=(lines_train,),\n", " compression=None,\n", " dtype=h5py.string_dtype('utf-8'))\n", " labels = h5f.create_dataset(\"label-train\",\n", " shape=(lines_train,),\n", " compression=None,\n", " dtype=\"bool\")\n", "\n", " # read num_lines in chunks of size chunksize\n", " for i in range(1, lines_train, chunksize): \n", "\n", " df = pd.read_csv(\n", " train_path, \n", " header=None, # we ignore the header by starting the loop from row 1\n", " nrows=chunksize,\n", " skiprows=i\n", " )\n", " \n", " titles = df.values[:, -2]\n", "\n", " # you don't have to do this at this step, you could also just store\n", " # this as a string, like in the original csv\n", " has_tags = [\n", " 
len(set(str(t).lower().split('|')).intersection(ALL_LANGS_SET)) > 0\n", " for t in df.values[:, -1]\n", " ]\n", " print(has_tags)\n", "\n", " items_num = len(titles)\n", "\n", " # this fills in the current chunk of the h5 file\n", " texts[i-1:i-1+items_num] = titles\n", " labels[i-1:i-1+items_num] = has_tags\n", "\n", "# Create test set\n", "\n", "'''" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "IOPub data rate exceeded.\n", "The notebook server will temporarily stop sending output\n", "to the client in order to avoid crashing it.\n", "To change this limit, set the config variable\n", "`--NotebookApp.iopub_data_rate_limit`.\n", "\n", "Current values:\n", "NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n", "NotebookApp.rate_limit_window=3.0 (secs)\n", "\n", "IOPub data rate exceeded.\n", "The notebook server will temporarily stop sending output\n", "to the client in order to avoid crashing it.\n", "To change this limit, set the config variable\n", "`--NotebookApp.iopub_data_rate_limit`.\n", "\n", "Current values:\n", "NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n", "NotebookApp.rate_limit_window=3.0 (secs)\n", "\n" ], "name": "stderr" } ] }, { "cell_type": "code", "metadata": { "id": "IslPgrNQig6P" }, "source": [ "import h5py\n", "import numpy as np\n", "\n", "import subprocess\n", "train_path, test_path = \"./howdoi_train.csv\", \"./howdoi_test.csv\"\n", "h5_train_path, h5_test_path = \"./data_tr.h5\", \"./data_ts.h5\"\n", "\n", "# this is just a random large number, this size of data (short strings)\n", "# doesn't take much RAM, not even sure we have to read it in chunks at all\n", "chunksize = 1000 * 10000\n", "\n", "# hacky way of reading the length of the file without opening it\n", "lines_train = subprocess.check_output(['wc', '-l', train_path])\n", "lines_train = int(lines_train.split()[0])\n", "\n", "df_train = pd.read_csv(train_path)\n", "df_test = pd.read_csv(test_path)" ], "execution_count": null, 
"outputs": [] }, { "cell_type": "code", "metadata": { "id": "iybXPZ2WoFJP" }, "source": [ "df_train['tags'] = df_train['tags'].map(lambda x:\n", " len(set(str(x).lower().split('|')).intersection(ALL_LANGS_SET)) > 0\n", " )\n", "df_test['tags'] = df_test['tags'].map(lambda x:\n", " len(set(str(x).lower().split('|')).intersection(ALL_LANGS_SET)) > 0\n", " )" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "kA2tOyy2pyHa", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "6c4a5775-a332-4674-c08e-eddb0b4ee430" }, "source": [ "df_test.head()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0Unnamed: 0.1titletags
01197140011971400Changing colors of shapes in HTML5 canvasjavascript|html|canvas|polygon
154337725433772Where to look for DB file after update-database?c#|.net|entity
289963048996304Graddle missing transitive dependencymaven|gradle|transitive-dependency
376482137648213laravel link does work but button does nottwitter-bootstrap|laravel
41412393814123938Elegant haskell case/error handling in sequent...haskell
\n", "
" ], "text/plain": [ " Unnamed: 0 ... tags\n", "0 11971400 ... javascript|html|canvas|polygon\n", "1 5433772 ... c#|.net|entity\n", "2 8996304 ... maven|gradle|transitive-dependency\n", "3 7648213 ... twitter-bootstrap|laravel\n", "4 14123938 ... haskell\n", "\n", "[5 rows x 4 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 9 } ] }, { "cell_type": "code", "metadata": { "id": "FRqvd1muijVE" }, "source": [ "import torch" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "qjtjK2tH6hdn" }, "source": [ "class QueryDataset(Dataset):\n", " def __init__(self, filename, kind):\n", " self.titles = df_train['title']\n", " self.labels = df_train['tags']\n", "\n", " def __len__(self):\n", " return self.titles.shape[0]\n", "\n", " def __getitem__(self, i):\n", " # now the cool bit - read without loading the whole thing in memory!\n", " title = self.titles[i]\n", " label = self.labels[i].astype('bool')\n", " label = 1 if label else 0\n", " # encoded = tokenizer(title, truncation=True, padding=True)\n", " out = {'title': title, 'label': label}\n", " return out" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "c9OPoW4uivuu" }, "source": [ "def collate_fn(data):\n", " titles, labels = [v['title'] for v in data], [v['label'] for v in data]\n", " encoded = tokenizer(titles, truncation=True, padding=True)\n", " # for k,v in encoded.items():\n", " # print(len(v[0]))\n", " out = {k: torch.tensor(v) for k,v in encoded.items()}\n", " out['labels'] = torch.tensor(labels)\n", " return out" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DxL9pAGSXyJZ", "outputId": "4e8fa188-eed7-454f-8e20-914152bfbb27" }, "source": [ "trainset = QueryDataset(h5_train_path, 'train')\n", "trainloader = DataLoader(trainset, batch_size=256, num_workers=2, shuffle=True,\n", " collate_fn=collate_fn) # This seemingly redundant 
# Build the training dataset and its loader.
trainset = QueryDataset(h5_train_path, 'train')
# collate_fn is passed explicitly; per the linked PyTorch issue comment this
# also avoids a RuntimeError:
# https://github.com/pytorch/pytorch/issues/42654#issuecomment-706926806
trainloader = DataLoader(
    trainset,
    batch_size=256,
    shuffle=True,
    num_workers=2,
    collate_fn=collate_fn,
)

# Smoke test: print the first collated batch and stop.
for batch_idx, sample_batch in enumerate(trainloader):
    print(sample_batch)
    break

from transformers import DistilBertForSequenceClassification, AdamW
"colab": { "base_uri": "https://localhost:8080/", "height": 237, "referenced_widgets": [ "7a54279586bb4c87a52746f605f7b3e2", "6a7acabfc3f24d3395c0cc7cde68a43a", "b9c6899ed3dd489d85b201b51a6f35c3", "b413ba22912c4149aeea6824ccbc90c1", "6151969c55174665964db6f97bd18884", "f97dbff45ea347f380b3d3e2bf9b60a5", "6bc47beb071c4884a4ddb8a7dd669222", "6acde23812d44176829ce106ea626f09", "6e48979f9d49495da15e278aafd97a8b", "73a609c59da744088c8def134a0acebc", "3e978968f8814d4e9cb0cce52fe25f53", "5b91aa38af194763b33168e24eb80b91", "27cc2164a06a40a0b6e093bd1f642fce", "1b013fd1c8ce45c992ed4dba0bcf2393", "daedb950ac444ef3aa764b29f3e84052", "579a1b0eff7b425a9f22a18506fb7033" ] }, "outputId": "7e390c52-f065-460c-dd08-8af04ee0af4e" }, "source": [ "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n", "\n", "model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')\n", "model.to(device)\n", "model.train()\n", "\n", "optim = AdamW(model.parameters(), lr=5e-5)\n", "\n", "for epoch in range(3):\n", " for batch in trainloader:\n", " optim.zero_grad()\n", " input_ids = batch['input_ids'].to(device)\n", " attention_mask = batch['attention_mask'].to(device)\n", " labels = batch['labels'].to(device)\n", " outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n", " loss = outputs[0]\n", " loss.backward()\n", " optim.step()\n", "\n", "model.eval()" ], "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7a54279586bb4c87a52746f605f7b3e2", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": 
"6e48979f9d49495da15e278aafd97a8b", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n", "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ], "name": "stderr" } ] }, { "cell_type": "code", "metadata": { "id": "mr_F0fiyzuy8" }, "source": [ "" ], "execution_count": null, "outputs": [] } ] }