{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Copy of HowDoI-NLP.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "toc_visible": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "b9f3692aa5394af38fbf8701f9d1a2f5": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_20cc5addfd6a404abd07e3e7410604bf",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_193cdcb227f4436ab295a1d03c935bc7",
              "IPY_MODEL_e3736ea8f8f1433fa58a2c7cefbc9d43"
            ]
          }
        },
        "20cc5addfd6a404abd07e3e7410604bf": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "193cdcb227f4436ab295a1d03c935bc7": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_0214896e38b7475b9810990cd98bcfbe",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 231508,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 231508,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_9d6b675edcf94469a3665d8ebd699cd6"
          }
        },
        "e3736ea8f8f1433fa58a2c7cefbc9d43": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_2a6a7cec7ce9445a854b6378fe5c7d69",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 232k/232k [00:00<00:00, 885kB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_af203886f4db407f92d990d4d421c207"
          }
        },
        "0214896e38b7475b9810990cd98bcfbe": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "9d6b675edcf94469a3665d8ebd699cd6": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "2a6a7cec7ce9445a854b6378fe5c7d69": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "af203886f4db407f92d990d4d421c207": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "89764229f6bb4419927cd6499b252cc1": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_489bcf4ae407408fa0b4f18193277f3d",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_c907be1df1ef42cda9d7a45ce3ae50ff",
              "IPY_MODEL_1f0711de1dde4e12b0ff8f7f11f38e5d"
            ]
          }
        },
        "489bcf4ae407408fa0b4f18193277f3d": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "c907be1df1ef42cda9d7a45ce3ae50ff": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_420b462bf1dc498082e1c341e9649e34",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 466062,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 466062,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_e47fef1e664a4a608cd595eb476b84d0"
          }
        },
        "1f0711de1dde4e12b0ff8f7f11f38e5d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_fc1006302b0741b18c5719d6b46b508b",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 466k/466k [00:00<00:00, 1.41MB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_20700e5b87dd4845b9d451a4e3a81794"
          }
        },
        "420b462bf1dc498082e1c341e9649e34": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "e47fef1e664a4a608cd595eb476b84d0": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "fc1006302b0741b18c5719d6b46b508b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "20700e5b87dd4845b9d451a4e3a81794": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "723f45572f9845f7a5a43d028acabd37": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_fd385e56155444c6ba0b70f461b014ef",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_d032b636ac73444ea8e13ec9ee1866d8",
              "IPY_MODEL_f6501a7bc274434eae206e1e1fb2e00a"
            ]
          }
        },
        "fd385e56155444c6ba0b70f461b014ef": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "d032b636ac73444ea8e13ec9ee1866d8": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_7b3318ea2d8e49e186f47c5792cbe30b",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 28,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 28,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_e9a8d590fca842859822b4cad55b1a5f"
          }
        },
        "f6501a7bc274434eae206e1e1fb2e00a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_f9924cd4fc98461c82148ac3a38a845d",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 28.0/28.0 [00:00<00:00, 673B/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_fd9e3dfc77e243ecb7f6257b6700e728"
          }
        },
        "7b3318ea2d8e49e186f47c5792cbe30b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "e9a8d590fca842859822b4cad55b1a5f": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "f9924cd4fc98461c82148ac3a38a845d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "fd9e3dfc77e243ecb7f6257b6700e728": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "7a54279586bb4c87a52746f605f7b3e2": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_6a7acabfc3f24d3395c0cc7cde68a43a",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_b9c6899ed3dd489d85b201b51a6f35c3",
              "IPY_MODEL_b413ba22912c4149aeea6824ccbc90c1"
            ]
          }
        },
        "6a7acabfc3f24d3395c0cc7cde68a43a": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "b9c6899ed3dd489d85b201b51a6f35c3": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_6151969c55174665964db6f97bd18884",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 442,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 442,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_f97dbff45ea347f380b3d3e2bf9b60a5"
          }
        },
        "b413ba22912c4149aeea6824ccbc90c1": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_6bc47beb071c4884a4ddb8a7dd669222",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 442/442 [00:00<00:00, 12.0kB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_6acde23812d44176829ce106ea626f09"
          }
        },
        "6151969c55174665964db6f97bd18884": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "f97dbff45ea347f380b3d3e2bf9b60a5": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "6bc47beb071c4884a4ddb8a7dd669222": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "6acde23812d44176829ce106ea626f09": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "6e48979f9d49495da15e278aafd97a8b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_73a609c59da744088c8def134a0acebc",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_3e978968f8814d4e9cb0cce52fe25f53",
              "IPY_MODEL_5b91aa38af194763b33168e24eb80b91"
            ]
          }
        },
        "73a609c59da744088c8def134a0acebc": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "3e978968f8814d4e9cb0cce52fe25f53": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_27cc2164a06a40a0b6e093bd1f642fce",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 267967963,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 267967963,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_1b013fd1c8ce45c992ed4dba0bcf2393"
          }
        },
        "5b91aa38af194763b33168e24eb80b91": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_daedb950ac444ef3aa764b29f3e84052",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "​",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 268M/268M [00:05<00:00, 50.3MB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_579a1b0eff7b425a9f22a18506fb7033"
          }
        },
        "27cc2164a06a40a0b6e093bd1f642fce": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "1b013fd1c8ce45c992ed4dba0bcf2393": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "daedb950ac444ef3aa764b29f3e84052": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "579a1b0eff7b425a9f22a18506fb7033": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        }
      }
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "co5T4uL_pXfO"
      },
      "source": [
        "## Fetch data from disk"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Pjc-a5jWpdzy",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "192951ce-5610-4f4b-b7eb-ef9fb6c9ece7"
      },
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Mounted at /content/drive\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VfZhymMbzuSg",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "41cf4baf-003f-4b50-9e65-5be0abce2a5c"
      },
      "source": [
        "!pip install transformers"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Collecting transformers\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)\n",
            "\u001b[K     |████████████████████████████████| 2.2MB 9.4MB/s \n",
            "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.41.1)\n",
            "Collecting sacremoses\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)\n",
            "\u001b[K     |████████████████████████████████| 870kB 41.6MB/s \n",
            "\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)\n",
            "\u001b[K     |████████████████████████████████| 3.3MB 42.0MB/s \n",
            "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n",
            "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n",
            "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers) (20.9)\n",
            "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from transformers) (3.8.1)\n",
            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.0.12)\n",
            "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n",
            "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n",
            "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n",
            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n",
            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n",
            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2020.12.5)\n",
            "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers) (2.4.7)\n",
            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers) (3.4.1)\n",
            "Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers) (3.7.4.3)\n",
            "Building wheels for collected packages: sacremoses\n",
            "  Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=8188b4a1fe7b93d2ceab6b8c4c244e549efa60bdfd85aed0aa3020e99990acc0\n",
            "  Stored in directory: /root/.cache/pip/wheels/3e/fb/c0/13ab4d63d537658f448366744654323077c4d90069b6512f3c\n",
            "Successfully built sacremoses\n",
            "Installing collected packages: sacremoses, tokenizers, transformers\n",
            "Successfully installed sacremoses-0.0.44 tokenizers-0.10.2 transformers-4.5.0\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lLppqXoRSjeO"
      },
      "source": [
        "## Data Prep"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VbcL94ugTLQ7"
      },
      "source": [
        "import pandas as pd\n",
        "from torch.utils.data import Dataset, IterableDataset, DataLoader, get_worker_info\n",
        "from transformers import DistilBertTokenizerFast\n",
        "import math"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 210,
          "referenced_widgets": [
            "b9f3692aa5394af38fbf8701f9d1a2f5",
            "20cc5addfd6a404abd07e3e7410604bf",
            "193cdcb227f4436ab295a1d03c935bc7",
            "e3736ea8f8f1433fa58a2c7cefbc9d43",
            "0214896e38b7475b9810990cd98bcfbe",
            "9d6b675edcf94469a3665d8ebd699cd6",
            "2a6a7cec7ce9445a854b6378fe5c7d69",
            "af203886f4db407f92d990d4d421c207",
            "89764229f6bb4419927cd6499b252cc1",
            "489bcf4ae407408fa0b4f18193277f3d",
            "c907be1df1ef42cda9d7a45ce3ae50ff",
            "1f0711de1dde4e12b0ff8f7f11f38e5d",
            "420b462bf1dc498082e1c341e9649e34",
            "e47fef1e664a4a608cd595eb476b84d0",
            "fc1006302b0741b18c5719d6b46b508b",
            "20700e5b87dd4845b9d451a4e3a81794",
            "723f45572f9845f7a5a43d028acabd37",
            "fd385e56155444c6ba0b70f461b014ef",
            "d032b636ac73444ea8e13ec9ee1866d8",
            "f6501a7bc274434eae206e1e1fb2e00a",
            "7b3318ea2d8e49e186f47c5792cbe30b",
            "e9a8d590fca842859822b4cad55b1a5f",
            "f9924cd4fc98461c82148ac3a38a845d",
            "fd9e3dfc77e243ecb7f6257b6700e728"
          ]
        },
        "id": "nplakpGpWubB",
        "outputId": "e3a3a5eb-d8d7-48e3-d50b-55ae1c801a33"
      },
      "source": [
        "ALL_LANGS = [\"A# .NET\",\"A# (Axiom)\",\"A-0 System\",\"A+\",\"A++\",\"ABAP\",\"ABC\",\"ABC ALGOL\",\"ABLE\",\"ABSET\",\"ABSYS\",\"ACC\",\"Accent\",\"Ace DASL\",\"ACL2\",\"ACT-III\",\"Action!\",\"ActionScript\",\"Ada\",\"Adenine\",\"Agda\",\"Agilent VEE\",\"Agora\",\"AIMMS\",\"Alef\",\"ALF\",\"ALGOL 58\",\"ALGOL 60\",\"ALGOL 68\",\"ALGOL W\",\"Alice\",\"Alma-0\",\"AmbientTalk\",\"Amiga E\",\"AMOS\",\"AMPL\",\"APL\",\"App Inventor for Android's visual block language\",\"AppleScript\",\"Arc\",\"ARexx\",\"Argus\",\"AspectJ\",\"Assembly language\",\"ATS\",\"Ateji PX\",\"AutoHotkey\",\"Autocoder\",\"AutoIt\",\"AutoLISP / Visual LISP\",\"Averest\",\"AWK\",\"Axum\",\"B\",\"Babbage\",\"Bash\",\"BASIC\",\"bc\",\"BCPL\",\"BeanShell\",\"Batch (Windows/Dos)\",\"Bertrand\",\"BETA\",\"Bigwig\",\"Bistro\",\"BitC\",\"BLISS\",\"Blue\",\"Bon\",\"Boo\",\"Boomerang\",\"Bourne shell\",\"bash\",\"ksh\",\"BREW\",\"BPEL\",\"C\",\"C--\",\"C++\",\"C#\",\"C/AL\",\"Caché ObjectScript\",\"C Shell\",\"Caml\",\"Candle\",\"Cayenne\",\"CDuce\",\"Cecil\",\"Cel\",\"Cesil\",\"Ceylon\",\"CFEngine\",\"CFML\",\"Cg\",\"Ch\",\"Chapel\",\"CHAIN\",\"Charity\",\"Charm\",\"Chef\",\"CHILL\",\"CHIP-8\",\"chomski\",\"ChucK\",\"CICS\",\"Cilk\",\"CL\",\"Claire\",\"Clarion\",\"Clean\",\"Clipper\",\"CLIST\",\"Clojure\",\"CLU\",\"CMS-2\",\"COBOL\",\"Cobra\",\"CODE\",\"CoffeeScript\",\"Cola\",\"ColdC\",\"ColdFusion\",\"COMAL\",\"Combined Programming Language\",\"COMIT\",\"Common Intermediate Language\",\"Common Lisp\",\"COMPASS\",\"Component Pascal\",\"Constraint Handling Rules\",\"Converge\",\"Cool\",\"Coq\",\"Coral 66\",\"Corn\",\"CorVision\",\"COWSEL\",\"CPL\",\"csh\",\"CSP\",\"Csound\",\"CUDA\",\"Curl\",\"Curry\",\"Cyclone\",\"Cython\",\"D\",\"DASL\",\"DASL\",\"Dart\",\"DataFlex\",\"Datalog\",\"DATATRIEVE\",\"dBase\",\"dc\",\"DCL\",\"Deesel\",\"Delphi\",\"DinkC\",\"DIBOL\",\"Dog\",\"Draco\",\"DRAKON\",\"Dylan\",\"DYNAMO\",\"E\",\"E#\",\"Ease\",\"Easy PL/I\",\"Easy Programming Language\",\"EASYTRIEVE PLUS\",\"ECMAScript\",\"Edinburgh IMP\",\"EGL\",\"Eiffel\",\"ELAN\",\"Elixir\",\"Elm\",\"Emacs Lisp\",\"Emerald\",\"Epigram\",\"EPL\",\"Erlang\",\"es\",\"Escapade\",\"Escher\",\"ESPOL\",\"Esterel\",\"Etoys\",\"Euclid\",\"Euler\",\"Euphoria\",\"EusLisp Robot Programming Language\",\"CMS EXEC\",\"EXEC 2\",\"Executable UML\",\"F\",\"F#\",\"Factor\",\"Falcon\",\"Fancy\",\"Fantom\",\"FAUST\",\"Felix\",\"Ferite\",\"FFP\",\"Fjölnir\",\"FL\",\"Flavors\",\"Flex\",\"FLOW-MATIC\",\"FOCAL\",\"FOCUS\",\"FOIL\",\"FORMAC\",\"@Formula\",\"Forth\",\"Fortran\",\"Fortress\",\"FoxBase\",\"FoxPro\",\"FP\",\"FPr\",\"Franz Lisp\",\"Frege\",\"F-Script\",\"FSProg\",\"G\",\"Google Apps Script\",\"Game Maker Language\",\"GameMonkey Script\",\"GAMS\",\"GAP\",\"G-code\",\"Genie\",\"GDL\",\"Gibiane\",\"GJ\",\"GEORGE\",\"GLSL\",\"GNU E\",\"GM\",\"Go\",\"Go!\",\"GOAL\",\"Gödel\",\"Godiva\",\"GOM (Good Old Mad)\",\"Goo\",\"Gosu\",\"GOTRAN\",\"GPSS\",\"GraphTalk\",\"GRASS\",\"Groovy\",\"Hack (programming language)\",\"HAL/S\",\"Hamilton C shell\",\"Harbour\",\"Hartmann pipelines\",\"Haskell\",\"Haxe\",\"High Level Assembly\",\"HLSL\",\"Hop\",\"Hope\",\"Hugo\",\"Hume\",\"HyperTalk\",\"IBM Basic assembly language\",\"IBM HAScript\",\"IBM Informix-4GL\",\"IBM RPG\",\"ICI\",\"Icon\",\"Id\",\"IDL\",\"Idris\",\"IMP\",\"Inform\",\"Io\",\"Ioke\",\"IPL\",\"IPTSCRAE\",\"ISLISP\",\"ISPF\",\"ISWIM\",\"J\",\"J#\",\"J++\",\"JADE\",\"Jako\",\"JAL\",\"Janus\",\"JASS\",\"Java\",\"JavaScript\",\"JCL\",\"JEAN\",\"Join Java\",\"JOSS\",\"Joule\",\"JOVIAL\",\"Joy\",\"JScript\",\"JScript .NET\",\"JavaFX Script\",\"Julia\",\"Jython\",\"K\",\"Kaleidoscope\",\"Karel\",\"Karel++\",\"KEE\",\"Kixtart\",\"KIF\",\"Kojo\",\"Kotlin\",\"KRC\",\"KRL\",\"KUKA\",\"KRYPTON\",\"ksh\",\"L\",\"L# .NET\",\"LabVIEW\",\"Ladder\",\"Lagoona\",\"LANSA\",\"Lasso\",\"LaTeX\",\"Lava\",\"LC-3\",\"Leda\",\"Legoscript\",\"LIL\",\"LilyPond\",\"Limbo\",\"Limnor\",\"LINC\",\"Lingo\",\"Linoleum\",\"LIS\",\"LISA\",\"Lisaac\",\"Lisp\",\"Lite-C\",\"Lithe\",\"Little b\",\"Logo\",\"Logtalk\",\"LPC\",\"LSE\",\"LSL\",\"LiveCode\",\"LiveScript\",\"Lua\",\"Lucid\",\"Lustre\",\"LYaPAS\",\"Lynx\",\"M2001\",\"M4\",\"Machine code\",\"MAD\",\"MAD/I\",\"Magik\",\"Magma\",\"make\",\"Maple\",\"MAPPER\",\"MARK-IV\",\"Mary\",\"MASM Microsoft Assembly x86\",\"Mathematica\",\"MATLAB\",\"Maxima\",\"Macsyma\",\"Max\",\"MaxScript\",\"Maya (MEL)\",\"MDL\",\"Mercury\",\"Mesa\",\"Metacard\",\"Metafont\",\"MetaL\",\"Microcode\",\"MicroScript\",\"MIIS\",\"MillScript\",\"MIMIC\",\"Mirah\",\"Miranda\",\"MIVA Script\",\"ML\",\"Moby\",\"Model 204\",\"Modelica\",\"Modula\",\"Modula-2\",\"Modula-3\",\"Mohol\",\"MOO\",\"Mortran\",\"Mouse\",\"MPD\",\"CIL\",\"MSL\",\"MUMPS\",\"NASM\",\"NATURAL\",\"Napier88\",\"Neko\",\"Nemerle\",\"nesC\",\"NESL\",\"Net.Data\",\"NetLogo\",\"NetRexx\",\"NewLISP\",\"NEWP\",\"Newspeak\",\"NewtonScript\",\"NGL\",\"Nial\",\"Nice\",\"Nickle\",\"Nim\",\"NPL\",\"Not eXactly C\",\"Not Quite C\",\"NSIS\",\"Nu\",\"NWScript\",\"NXT-G\",\"o:XML\",\"Oak\",\"Oberon\",\"Obix\",\"OBJ2\",\"Object Lisp\",\"ObjectLOGO\",\"Object REXX\",\"Object Pascal\",\"Objective-C\",\"Objective-J\",\"Obliq\",\"Obol\",\"OCaml\",\"occam\",\"occam-π\",\"Octave\",\"OmniMark\",\"Onyx\",\"Opa\",\"Opal\",\"OpenCL\",\"OpenEdge ABL\",\"OPL\",\"OPS5\",\"OptimJ\",\"Orc\",\"ORCA/Modula-2\",\"Oriel\",\"Orwell\",\"Oxygene\",\"Oz\",\"P#\",\"ParaSail (programming language)\",\"PARI/GP\",\"Pascal\",\"Pawn\",\"PCASTL\",\"PCF\",\"PEARL\",\"PeopleCode\",\"Perl\",\"PDL\",\"PHP\",\"Phrogram\",\"Pico\",\"Picolisp\",\"Pict\",\"Pike\",\"PIKT\",\"PILOT\",\"Pipelines\",\"Pizza\",\"PL-11\",\"PL/0\",\"PL/B\",\"PL/C\",\"PL/I\",\"PL/M\",\"PL/P\",\"PL/SQL\",\"PL360\",\"PLANC\",\"Plankalkül\",\"Planner\",\"PLEX\",\"PLEXIL\",\"Plus\",\"POP-11\",\"PostScript\",\"PortablE\",\"Powerhouse\",\"PowerBuilder\",\"PowerShell\",\"PPL\",\"Processing\",\"Processing.js\",\"Prograph\",\"PROIV\",\"Prolog\",\"PROMAL\",\"Promela\",\"PROSE modeling language\",\"PROTEL\",\"ProvideX\",\"Pro*C\",\"Pure\",\"Python\",\"Q (equational programming language)\",\"Q (programming language from Kx Systems)\",\"Qalb\",\"QtScript\",\"QuakeC\",\"QPL\",\"R\",\"R++\",\"Racket\",\"RAPID\",\"Rapira\",\"Ratfiv\",\"Ratfor\",\"rc\",\"REBOL\",\"Red\",\"Redcode\",\"REFAL\",\"Reia\",\"Revolution\",\"rex\",\"REXX\",\"Rlab\",\"RobotC\",\"ROOP\",\"RPG\",\"RPL\",\"RSL\",\"RTL/2\",\"Ruby\",\"RuneScript\",\"Rust\",\"S\",\"S2\",\"S3\",\"S-Lang\",\"S-PLUS\",\"SA-C\",\"SabreTalk\",\"SAIL\",\"SALSA\",\"SAM76\",\"SAS\",\"SASL\",\"Sather\",\"Sawzall\",\"SBL\",\"Scala\",\"Scheme\",\"Scilab\",\"Scratch\",\"Script.NET\",\"Sed\",\"Seed7\",\"Self\",\"SenseTalk\",\"SequenceL\",\"SETL\",\"Shift Script\",\"SIMPOL\",\"SIGNAL\",\"SiMPLE\",\"SIMSCRIPT\",\"Simula\",\"Simulink\",\"SISAL\",\"SLIP\",\"SMALL\",\"Smalltalk\",\"Small Basic\",\"SML\",\"Snap!\",\"SNOBOL\",\"SPITBOL\",\"Snowball\",\"SOL\",\"Span\",\"SPARK\",\"Speedcode\",\"SPIN\",\"SP/k\",\"SPS\",\"Squeak\",\"Squirrel\",\"SR\",\"S/SL\",\"Stackless Python\",\"Starlogo\",\"Strand\",\"Stata\",\"Stateflow\",\"Subtext\",\"SuperCollider\",\"SuperTalk\",\"Swift (Apple programming language)\",\"Swift (parallel scripting language)\",\"SYMPL\",\"SyncCharts\",\"SystemVerilog\",\"T\",\"TACL\",\"TACPOL\",\"TADS\",\"TAL\",\"Tcl\",\"Tea\",\"TECO\",\"TELCOMP\",\"TeX\",\"TEX\",\"TIE\",\"Timber\",\"TMG\",\"Tom\",\"TOM\",\"Topspeed\",\"TPU\",\"Trac\",\"TTM\",\"T-SQL\",\"TTCN\",\"Turing\",\"TUTOR\",\"TXL\",\"TypeScript\",\"Turbo C++\",\"Ubercode\",\"UCSD Pascal\",\"Umple\",\"Unicon\",\"Uniface\",\"UNITY\",\"Unix shell\",\"UnrealScript\",\"Vala\",\"VBA\",\"VBScript\",\"Verilog\",\"VHDL\",\"Visual Basic\",\"Visual Basic .NET\",\"Visual DataFlex\",\"Visual DialogScript\",\"Visual Fortran\",\"Visual FoxPro\",\"Visual J++\",\"Visual J#\",\"Visual Objects\",\"Visual Prolog\",\"VSXu\",\"Vvvv\",\"WATFIV, WATFOR\",\"WebDNA\",\"WebQL\",\"Windows PowerShell\",\"Winbatch\",\"Wolfram\",\"Wyvern\",\"X++\",\"X#\",\"X10\",\"XBL\",\"XC\",\"XMOS architecture\",\"xHarbour\",\"XL\",\"Xojo\",\"XOTcl\",\"XPL\",\"XPL0\",\"XQuery\",\"XSB\",\"XSLT\",\"XPath\",\"Xtend\",\"Yorick\",\"YQL\",\"Z notation\",\"Zeno\",\"ZOPL\",\"ZPL\"]\n",
        "ALL_LANGS = list(map(lambda x: x.lower(), ALL_LANGS))\n",
        "ALL_LANGS_SET = set(ALL_LANGS)\n",
        "print(ALL_LANGS_SET)\n",
        "\n",
        "tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "{'k', 'gamemonkey script', 'prose modeling language', 'lua', 'x++', 'txl', 'joule', 'xotcl', 'opal', 'lite-c', 'algol 58', 'uniface', 'spin', 'icon', 'promela', 'g', 'chapel', 'nickle', 'nesc', 'boomerang', 'xl', 'l', 'averest', 'b', 'dataflex', 'datatrieve', 'smalltalk', 'mary', 'foxpro', 'prograph', 'moby', 'stackless python', 'assembly language', 'c/al', 'amiga e', 'caml', 'beanshell', 'rpg', 'ada', 'postscript', 'f#', 'snap!', 'iptscrae', 'dbase', 'ferite', 'ceylon', 'visual foxpro', 'redcode', 'quakec', 'mortran', 'kif', 'xpath', 'vbscript', 'goo', 'ace dasl', 'cg', 'lynx', 'sbl', 'yql', 'a+', 'csound', 'dibol', 'abc algol', 'elan', 'q (equational programming language)', 'spitbol', 'lansa', 'rapid', 'csp', 'octave', 'george', 'cil', 'bon', 'cfml', 'pipelines', 'karel++', 'charity', 'argus', 'arc', 'alice', 'a++', 'clu', 'elixir', 'simscript', 'lilypond', 'starlogo', 'pizza', 'pl/0', 'self', 'clojure', 'reia', 'masm microsoft assembly x86', 'maxscript', 'latex', 'ppl', 'tmg', 'fp', 'formac', 'squirrel', 'mirah', 'typescript', 'miis', 'able', 'turing', 'synccharts', 'salsa', 'cyclone', 'squeak', 'occam-π', 'turbo c++', 'teco', 't-sql', 'ratfiv', 'not exactly c', 'graphtalk', 'xquery', 'planc', 'powerhouse', 'ibm hascript', 'logo', 'e#', 'bliss', 'ttm', 'bash', 'krl', 'tads', 'c#', 'java', 'ubercode', 'simula', 'simulink', 'ops5', 'grass', 'sml', 'adenine', 'code', 'mapper', 'sp/k', 'alf', 'kaleidoscope', 'robotc', 'macsyma', 'bc', 'joss', 'portable', 'inform', 'lingo', 'pcastl', 'cool', 'drakon', 'high level assembly', 'vsxu', 'rebol', 'objective-j', 'zeno', 'algol 60', 'sed', 'ipl', 'o:xml', 'nxt-g', 'cms-2', 'phrogram', 'jade', 'proiv', 'ici', 'julia', 'leda', 'processing', 'autolisp / visual lisp', 'systemverilog', 'pure', 'swift (apple programming language)', 'fancy', 'strand', 'j#', 'orwell', 'pl/m', 'lithe', 'chill', 'tacpol', 'coldfusion', 'logtalk', 'modelica', 'prolog', 'scratch', 'algol 68', 'jako', 'not quite c', 'mouse', 'watfiv, watfor', 'mdl', 'shift script', 'j', 'jscript .net', 'vvvv', 'mohol', 'clipper', 'little b', 'modula', 'mimic', 'f', 'gödel', 'cobol', 'unity', 'nwscript', 'ratfor', 'supercollider', 'red', 'bitc', 'clarion', 'kotlin', 'arexx', 'pari/gp', 'obol', 'a-0 system', 'millscript', 'abc', 'newspeak', 'dcl', 'oriel', 'act-iii', 'godiva', 'limbo', 'joy', 'mathematica', 'delphi', 'idl', 'autohotkey', 'parasail (programming language)', 'magik', 'nim', 'ocaml', 'easy programming language', 'janus', 'goal', 'modula-3', 'coldc', 'gnu e', 'lse', 'oberon', 'genie', 'esterel', 'gotran', 'qpl', 'ateji px', 'hlsl', 'small basic', 'r', 'focal', 'cola', 'euphoria', 'acl2', 'brew', 'linc', 'orc', 'bourne shell', 'winbatch', 'coffeescript', 'stata', 'tal', 'mad/i', 'snowball', 'object rexx', 'visual prolog', 'pl/i', 'cics', 'cobra', 'haskell', 'visual j++', 'comit', 'cuda', 'snobol', 'lyapas', 'planner', 'curry', 'bigwig', 'javafx script', 'pop-11', 'visual dataflex', 'agora', 'egl', 'falcon', 'go', 'topspeed', 'hope', 'visual basic .net', 'hypertalk', 'accent', 'dynamo', 'ffp', 'natural', 'factor', 'ml', 'object lisp', 'tacl', 'caché objectscript', 'p#', 'sasl', 'tcl', 'mercury', 'fl', 'dog', 'simpol', 'amos', 'jass', 'epl', 'fantom', 'seed7', 'gpss', 'unicon', 'apl', 'pearl', 'abset', 'scala', 'neko', 'ladder', 'emacs lisp', 'stateflow', 'compass', 's3', 'aimms', 'lava', 'ucsd pascal', 'axum', 'visual objects', 'autoit', 'chain', 'a# .net', 'cel', 'setl', 'pl/p', 'candle', 'hume', 'awk', 'x10', 'nsis', 'combined programming language', 'max', 'mark-iv', 'gosu', 'jscript', 'sabretalk', 'io', 'sawzall', 'z notation', 'absys', 'easytrieve plus', 'visual basic', 'iswim', 'rc', 'scheme', 'orca/modula-2', 'timber', 'chip-8', 'python', 'clist', 'trac', 'sensetalk', 'revolution', 'sas', 'ngl', 'hartmann pipelines', 'simple', 'oxygene', 'obliq', 'euslisp robot programming language', 'kee', 'sail', 'coq', 'nial', 'sps', 'agilent vee', 'lc-3', 'pcf', 'bistro', 'ease', 'roop', 'lis', 'erlang', 'optimj', 'pawn', 's', 'join java', 's2', 'c', 'haxe', 'fortran', 'cfengine', 'boo', 'escher', 'executable uml', 'deesel', 'metafont', 'xsb', 'powershell', 'swift (parallel scripting language)', 'tutor', 'cecil', 'zpl', 's-plus', 'onyx', 'm2001', 'miranda', 'fjölnir', 'matlab', 'fsprog', 'objectlogo', 'e', 'pascal', 'pl/sql', 'plexil', 'moo', 'maple', 'yorick', 'abap', 'miva script', 'autocoder', 'hal/s', 'j++', 'imp', 'c++', 'ibm rpg', 'speedcode', 'tpu', 'visual j#', 'modula-2', 'xbl', 'pdl', 'franz lisp', 'gj', 'groovy', 'napier88', 'tex', 'flow-matic', 'xmos architecture', 'claire', 'php', 'lpc', 'lucid', 'gams', 'gom (good old mad)', 'g-code', 'msl', 'cowsel', 'ats', 'xharbour', 'foxbase', 'forth', 'oak', 'ruby', 'plankalkül', 'ampl', 'cms exec', 'labview', 'model 204', 'a# (axiom)', 'hamilton c shell', 'sequencel', 'peoplecode', 'ibm basic assembly language', 'c shell', 'sam76', 'unix shell', 'webdna', 'algol w', 'visual dialogscript', 'mpd', 'beta', 'nice', 'signal', 'dasl', 'tie', 'legoscript', 'limnor', 'ttcn', 's-lang', 'nesl', 'bpel', 'common intermediate language', 'flavors', 'edinburgh imp', 'csh', 'euler', 'm4', 'pico', 'vhdl', 'protel', 'sa-c', 'lustre', 'mad', 'obj2', 'common lisp', 'newtonscript', 'krc', 'etoys', 'opl', 'powerbuilder', 'racket', 'windows powershell', 'converge', 'nasm', 'xc', 'f-script', 'livecode', 'espol', 'sol', 'script.net', 'sisal', 'batch (windows/dos)', 'microcode', 'rex', 'refal', 'rlab', 'lagoona', 'lisaac', 'picolisp', 'providex', 'r++', 'machine code', 'cesil', 'vala', 'es', 'obix', 'xpl0', 'alef', 'cduce', 'newp', 'lisp', 'cayenne', 'constraint handling rules', 'sr', 'subtext', 'cilk', 'dc', 'epigram', 'jal', 'kixtart', 'cpl', 'opa', 'felix', '@formula', 'hugo', 'plex', 'gibiane', 'maya (mel)', 'zopl', 'jython', 's/sl', 'jean', 'qtscript', 'rapira', 'supertalk', 'nu', 'ioke', 'gm', 'clean', 'microscript', 'promal', 'harbour', 'webql', 'scilab', 'rsl', 'xojo', 'applescript', 'foil', 'jovial', 'corvision', 'oz', 'coral 66', 'eiffel', 'maxima', 'dylan', 'pl/b', 'component pascal', 'npl', 'draco', 'opencl', 'perl', 'krypton', 'pl-11', 'xpl', 'netrexx', 'game maker language', 'umple', 'kojo', 'lsl', 'faust', 'gdl', 'object pascal', 'idris', 'unrealscript', 'action!', \"app inventor for android's visual block language\", 'focus', 'rexx', 'glsl', 'metal', 'sather', 'newlisp', 'islisp', 'flex', 'ambienttalk', 'pro*c', 'kuka', 'charm', 'openedge abl', 'xslt', 'occam', 'telcomp', 'dinkc', 'dart', 'bertrand', 'ksh', 'c--', 'lasso', 'slip', 'ibm informix-4gl', 'chuck', 'd', 'go!', 'fortress', 'l# .net', 'comal', 'q (programming language from kx systems)', 'xtend', 't', 'livescript', 'pict', 'ispf', 'vba', 'alma-0', 'pike', 'curl', 'small', 'rust', 'mesa', 'rtl/2', 'hop', 'javascript', 'lil', 'exec 2', 'pikt', 'frege', 'qalb', 'bcpl', 'chomski', 'aspectj', 'rpl', 'fpr', 'plus', 'gap', 'tea', 'wyvern', 'cl', 'babbage', 'x#', 'ch', 'agda', 'cython', 'pl/c', 'elm', 'basic', 'nemerle', 'lisa', 'netlogo', 'euclid', 'pilot', 'tom', 'actionscript', 'processing.js', 'corn', 'metacard', 'sympl', 'magma', 'visual fortran', 'make', 'span', 'ecmascript', 'jcl', 'emerald', 'google apps script', 'spark', 'net.data', 'objective-c', 'blue', 'omnimark', 'mumps', 'runescript', 'verilog', 'karel', 'id', 'escapade', 'linoleum', 'easy pl/i', 'wolfram', 'datalog', 'chef', 'acc', 'pl360', 'hack (programming language)'}\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "b9f3692aa5394af38fbf8701f9d1a2f5",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "89764229f6bb4419927cd6499b252cc1",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "723f45572f9845f7a5a43d028acabd37",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gl4s2lbNNo_f"
      },
      "source": [
        "!cp ./drive/MyDrive/howdoi_train.csv ./\n",
        "!cp ./drive/MyDrive/howdoi_test.csv ./"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "bPeuuANP_Kj_",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "93c5bd8c-45ed-47ad-fea6-15e6637dbb2b"
      },
      "source": [
        "# do lazy loading with h5py to save memory\n",
        "'''\n",
        "import h5py\n",
        "import numpy as np\n",
        "\n",
        "import subprocess\n",
        "train_path, test_path = \"./howdoi_train.csv\",  \"./howdoi_test.csv\"\n",
        "h5_train_path, h5_test_path = \"./data_tr.h5\", \"./data_ts.h5\"\n",
        "\n",
        "# this is just a random large number, this size of data (short strings)\n",
        "#   doesn't take much RAM, not even sure we have to read it in chunks at all\n",
        "chunksize = 1000 * 10000\n",
        "\n",
        "# hacky way of reading the length of the file without opening it\n",
        "lines_train = subprocess.check_output(['wc', '-l', train_path])\n",
        "lines_train = int(lines_train.split()[0])\n",
        "\n",
        "# h5 is a format you can read from without loading up the data in memory\n",
        "#   so it's perfect for huge datasets\n",
        "\n",
        "# NOTE: this will take a minute or so\n",
        "with h5py.File(h5_train_path, 'w') as h5f:\n",
        "    # use num_features if the csv file has no column header\n",
        "    texts = h5f.create_dataset(\"text-train\",\n",
        "                               shape=(lines_train,),\n",
        "                               compression=None,\n",
        "                               dtype=h5py.string_dtype('utf-8'))\n",
        "    labels = h5f.create_dataset(\"label-train\",\n",
        "                               shape=(lines_train,),\n",
        "                               compression=None,\n",
        "                               dtype=\"bool\")\n",
        "\n",
        "    # read num_lines in chunks of size chunksize\n",
        "    for i in range(1, lines_train, chunksize):  \n",
        "\n",
        "        df = pd.read_csv(\n",
        "          train_path,  \n",
        "          header=None, # we ignore the header by starting the loop from row 1\n",
        "          nrows=chunksize,\n",
        "          skiprows=i\n",
        "        )\n",
        "        \n",
        "        titles = df.values[:, -2]\n",
        "\n",
        "        # you don't have to do this at this step, you could also just store\n",
        "        #   this as a string, like in the original csv\n",
        "        has_tags = [\n",
        "          len(set(str(t).lower().split('|')).intersection(ALL_LANGS_SET)) > 0\n",
        "          for t in df.values[:, -1]\n",
        "        ]\n",
        "        print(has_tags)\n",
        "\n",
        "        items_num = len(titles)\n",
        "\n",
        "        # this fills in the current chunk of the h5 file\n",
        "        texts[i-1:i-1+items_num] = titles\n",
        "        labels[i-1:i-1+items_num] = has_tags\n",
        "\n",
        "# Create test set\n",
        "\n",
        "'''"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "IOPub data rate exceeded.\n",
            "The notebook server will temporarily stop sending output\n",
            "to the client in order to avoid crashing it.\n",
            "To change this limit, set the config variable\n",
            "`--NotebookApp.iopub_data_rate_limit`.\n",
            "\n",
            "Current values:\n",
            "NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n",
            "NotebookApp.rate_limit_window=3.0 (secs)\n",
            "\n",
            "IOPub data rate exceeded.\n",
            "The notebook server will temporarily stop sending output\n",
            "to the client in order to avoid crashing it.\n",
            "To change this limit, set the config variable\n",
            "`--NotebookApp.iopub_data_rate_limit`.\n",
            "\n",
            "Current values:\n",
            "NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n",
            "NotebookApp.rate_limit_window=3.0 (secs)\n",
            "\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "IslPgrNQig6P"
      },
      "source": [
        "import h5py\n",
        "import numpy as np\n",
        "\n",
        "import subprocess\n",
        "train_path, test_path = \"./howdoi_train.csv\",  \"./howdoi_test.csv\"\n",
        "h5_train_path, h5_test_path = \"./data_tr.h5\", \"./data_ts.h5\"\n",
        "\n",
        "# this is just a random large number, this size of data (short strings)\n",
        "#   doesn't take much RAM, not even sure we have to read it in chunks at all\n",
        "chunksize = 1000 * 10000\n",
        "\n",
        "# hacky way of reading the length of the file without opening it\n",
        "lines_train = subprocess.check_output(['wc', '-l', train_path])\n",
        "lines_train = int(lines_train.split()[0])\n",
        "\n",
        "df_train = pd.read_csv(train_path)\n",
        "df_test = pd.read_csv(test_path)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "iybXPZ2WoFJP"
      },
      "source": [
        "df_train['tags'] = df_train['tags'].map(lambda x:\n",
        "          len(set(str(x).lower().split('|')).intersection(ALL_LANGS_SET)) > 0\n",
        "        )\n",
        "df_test['tags'] = df_test['tags'].map(lambda x:\n",
        "          len(set(str(x).lower().split('|')).intersection(ALL_LANGS_SET)) > 0\n",
        "        )"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kA2tOyy2pyHa",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "outputId": "6c4a5775-a332-4674-c08e-eddb0b4ee430"
      },
      "source": [
        "df_test.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Unnamed: 0</th>\n",
              "      <th>Unnamed: 0.1</th>\n",
              "      <th>title</th>\n",
              "      <th>tags</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>11971400</td>\n",
              "      <td>11971400</td>\n",
              "      <td>Changing colors of shapes in HTML5 canvas</td>\n",
              "      <td>javascript|html|canvas|polygon</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>5433772</td>\n",
              "      <td>5433772</td>\n",
              "      <td>Where to look for DB file after update-database?</td>\n",
              "      <td>c#|.net|entity</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>8996304</td>\n",
              "      <td>8996304</td>\n",
              "      <td>Graddle missing transitive dependency</td>\n",
              "      <td>maven|gradle|transitive-dependency</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>7648213</td>\n",
              "      <td>7648213</td>\n",
              "      <td>laravel link does work but button does not</td>\n",
              "      <td>twitter-bootstrap|laravel</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>14123938</td>\n",
              "      <td>14123938</td>\n",
              "      <td>Elegant haskell case/error handling in sequent...</td>\n",
              "      <td>haskell</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "   Unnamed: 0  ...                                tags\n",
              "0    11971400  ...      javascript|html|canvas|polygon\n",
              "1     5433772  ...                      c#|.net|entity\n",
              "2     8996304  ...  maven|gradle|transitive-dependency\n",
              "3     7648213  ...           twitter-bootstrap|laravel\n",
              "4    14123938  ...                             haskell\n",
              "\n",
              "[5 rows x 4 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 9
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FRqvd1muijVE"
      },
      "source": [
        "import torch"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "qjtjK2tH6hdn"
      },
      "source": [
        "class QueryDataset(Dataset):\n",
        "  def __init__(self, filename, kind):\n",
        "    self.titles = df_train['title']\n",
        "    self.labels = df_train['tags']\n",
        "\n",
        "  def __len__(self):\n",
        "    return self.titles.shape[0]\n",
        "\n",
        "  def __getitem__(self, i):\n",
        "    # now the cool bit - read without loading the whole thing in memory!\n",
        "    title = self.titles[i]\n",
        "    label = self.labels[i].astype('bool')\n",
        "    label = 1 if label else 0\n",
        "    # encoded = tokenizer(title, truncation=True, padding=True)\n",
        "    out = {'title': title, 'label': label}\n",
        "    return out"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "c9OPoW4uivuu"
      },
      "source": [
        "def collate_fn(data):\n",
        "  titles, labels = [v['title'] for v in data], [v['label'] for v in data]\n",
        "  encoded = tokenizer(titles, truncation=True, padding=True)\n",
        "  # for k,v in encoded.items():\n",
        "  #   print(len(v[0]))\n",
        "  out = {k: torch.tensor(v) for k,v in encoded.items()}\n",
        "  out['labels'] = torch.tensor(labels)\n",
        "  return out"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "DxL9pAGSXyJZ",
        "outputId": "4e8fa188-eed7-454f-8e20-914152bfbb27"
      },
      "source": [
        "trainset = QueryDataset(h5_train_path, 'train')\n",
        "trainloader = DataLoader(trainset, batch_size=256, num_workers=2, shuffle=True,\n",
        "                        collate_fn=collate_fn) # This seemingly redundant collate_fn param actually helps avoid a RuntimeError - https://github.com/pytorch/pytorch/issues/42654#issuecomment-706926806\n",
        "for i, y in enumerate(trainloader):\n",
        "  print(y)\n",
        "  break"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "{'input_ids': tensor([[  101, 24357,  3746,  ...,     0,     0,     0],\n",
            "        [  101, 10463,  5164,  ...,     0,     0,     0],\n",
            "        [  101,  2129,  2000,  ...,     0,     0,     0],\n",
            "        ...,\n",
            "        [  101,  9585,  8011,  ...,     0,     0,     0],\n",
            "        [  101,  2054,  2024,  ...,     0,     0,     0],\n",
            "        [  101,  2129,  2064,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],\n",
            "        [1, 1, 1,  ..., 0, 0, 0],\n",
            "        [1, 1, 1,  ..., 0, 0, 0],\n",
            "        ...,\n",
            "        [1, 1, 1,  ..., 0, 0, 0],\n",
            "        [1, 1, 1,  ..., 0, 0, 0],\n",
            "        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,\n",
            "        0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0,\n",
            "        1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,\n",
            "        1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,\n",
            "        1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,\n",
            "        0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,\n",
            "        1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,\n",
            "        1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,\n",
            "        1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,\n",
            "        1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,\n",
            "        1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1])}\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2nhCSTDlyC5m"
      },
      "source": [
        "## Model training code"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2z7cHC1xyAZr"
      },
      "source": [
        "from transformers import DistilBertForSequenceClassification, AdamW"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "HmD6NW1FyZjM",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 237,
          "referenced_widgets": [
            "7a54279586bb4c87a52746f605f7b3e2",
            "6a7acabfc3f24d3395c0cc7cde68a43a",
            "b9c6899ed3dd489d85b201b51a6f35c3",
            "b413ba22912c4149aeea6824ccbc90c1",
            "6151969c55174665964db6f97bd18884",
            "f97dbff45ea347f380b3d3e2bf9b60a5",
            "6bc47beb071c4884a4ddb8a7dd669222",
            "6acde23812d44176829ce106ea626f09",
            "6e48979f9d49495da15e278aafd97a8b",
            "73a609c59da744088c8def134a0acebc",
            "3e978968f8814d4e9cb0cce52fe25f53",
            "5b91aa38af194763b33168e24eb80b91",
            "27cc2164a06a40a0b6e093bd1f642fce",
            "1b013fd1c8ce45c992ed4dba0bcf2393",
            "daedb950ac444ef3aa764b29f3e84052",
            "579a1b0eff7b425a9f22a18506fb7033"
          ]
        },
        "outputId": "7e390c52-f065-460c-dd08-8af04ee0af4e"
      },
      "source": [
        "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
        "\n",
        "model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')\n",
        "model.to(device)\n",
        "model.train()\n",
        "\n",
        "optim = AdamW(model.parameters(), lr=5e-5)\n",
        "\n",
        "for epoch in range(3):\n",
        "  for batch in trainloader:\n",
        "    optim.zero_grad()\n",
        "    input_ids = batch['input_ids'].to(device)\n",
        "    attention_mask = batch['attention_mask'].to(device)\n",
        "    labels = batch['labels'].to(device)\n",
        "    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
        "    loss = outputs[0]\n",
        "    loss.backward()\n",
        "    optim.step()\n",
        "\n",
        "model.eval()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "7a54279586bb4c87a52746f605f7b3e2",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "6e48979f9d49495da15e278aafd97a8b",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "stream",
          "text": [
            "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
            "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
            "- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
            "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
            "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "mr_F0fiyzuy8"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}