{ "cells": [ { "cell_type": "markdown", "source": [ "**Install All the Required Libraries**" ], "metadata": { "id": "J-_xGH7WddfC" } }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9P4RR3jOYP9X", "outputId": "d3409611-0f95-496e-ee05-f6ea398a7709" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/542.0 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m317.4/542.0 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m542.0/542.0 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m171.5/171.5 kB\u001b[0m \u001b[31m12.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.5/85.5 MB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m15.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m25.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m16.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m401.2/401.2 kB\u001b[0m \u001b[31m34.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] } ], "source": [ "# Install the 'datasets' library 
to load data from Hugging Face into the Google Colab Notebook.\n", "# Install the 'transformers' library to utilize the AutoTokenizer for converting raw text into tokens.\n", "# Install the 'sentence_transformers' library to acquire the embedding model.\n", "# Install 'faiss-gpu' for efficient similarity searches with dense vectors.\n", "# Use the %pip magic (not !pip) so the packages are installed into the running kernel's environment.\n", "%pip install -q datasets transformers sentence_transformers faiss-gpu\n" ] }, { "cell_type": "markdown", "source": [ "**Set the Environment to Use Hugging Face Token:** Ensure that the environment is configured to authenticate using your Hugging Face API token." ], "metadata": { "id": "B15TmeUZdooj" } }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9fz0LPTnZEHu", "outputId": "b81fca08-dfe1-4d6f-859b-babc77be3123" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Token successfully set.\n" ] } ], "source": [ "import os\n", "from IPython.display import clear_output\n", "from getpass import getpass\n", "\n", "# Prompt for the Hugging Face token with getpass so it is never echoed into the notebook output.\n", "# Surrounding whitespace from copy/paste is stripped before the value is used.\n", "hf_token = getpass(\"Please enter your Hugging Face token: \").strip()\n", "clear_output(wait=True)\n", "if hf_token:\n", "    # Expose the token to this kernel's process through the HF_TOKEN environment variable.\n", "    os.environ[\"HF_TOKEN\"] = hf_token\n", "    print(\"Token successfully set.\")\n", "else:\n", "    # Nothing was entered: leave the environment untouched instead of reporting a false success.\n", "    print(\"No token entered; HF_TOKEN was not changed.\")\n" ] }, { "cell_type": "markdown", "source": [ "**Load the Dataset**" ], "metadata": { "id": "JQgXmGCldv7q" } }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 341, "referenced_widgets": [ "2847e9b5bf324ca0820f00da03eaeb11", "9f34117172f641bdae2c9eb822d1219a", "1b2d3141feb84005bcdf8bcb8329655d", "c8611415d5124c10b9537abfed7363db", "bd11c2d8bea8452d8e9e7dbaa385461d", "df5eb7f3eda9416086f2366748597cee", "73f4c2c2cea849f7a36df0d86b80debf", "2451c446566d46ce9b39afd89b62c44a", "921eb9a8c82d4a868488478e651faafb", "6347af2fb7bc40ce865f5b0e1b577ccb", "cae40c1dfb3441c5930214eb1548a746", "7cd7bfcb470440bc9458e0fb1807d1cd", 
"359e70118ebc4440ae4bca80aed91a4a", "f986aa1f103441a1af452b0226a7f52a", "9b4fc15b935f46dcb635feb978e74d4f", "f123b70067d8430585986b3681c48390", "be78b05a31f640c6b056448e5d9766de", "70626c5befbb463a9bbda3da59978df9", "a115a266b66043839a9ecf9420d4c5b2", "596f52bad5804282a3bd4fe3a168a797", "60d36f5c5f4541e0a110a9a9465ef597", "e6ee92b1b42b4ab48364c2412122ba1a", "589e85335fa44345a7b3dd751344465e", "fd2d0b59b37d42f1b35c42d8926fc5df", "01f9d4a160874f5ba9d69ce784e39fa9", "e550745f71c044d89f0c05f75a11a5e2", "4b557092255241b999c9983f64dd42ba", "731c30a7520045a2ab0c5889d651ef89", "ab0084b13d3d40a68f4b3510c3038a7f", "4a99065cea7b462180963a38695fc00f", "eb943879e1e841b1bfb16b1cb499e9df", "ed5e59b629aa4082ba6875c3882e6ba1", "29298a48c590451d8e10fb45deb6b853" ] }, "id": "u_ma7FfGZMBC", "outputId": "889f08e3-74ba-491c-ded9-fbf14859a3af" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still optional to access public models or datasets.\n", " warnings.warn(\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "Downloading readme: 0%| | 0.00/5.34k [00:00, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "2847e9b5bf324ca0820f00da03eaeb11" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "Downloading data: 0%| | 0.00/15.6M [00:00, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "7cd7bfcb470440bc9458e0fb1807d1cd" } }, "metadata": {} }, { 
"output_type": "display_data", "data": { "text/plain": [ "Generating train split: 0%| | 0/24926 [00:00, ? examples/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "589e85335fa44345a7b3dd751344465e" } }, "metadata": {} }, { "output_type": "execute_result", "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['input', 'output', 'instruction', 'data_source'],\n", " num_rows: 24926\n", " })\n", "})" ] }, "metadata": {}, "execution_count": 3 } ], "source": [ "from datasets import load_dataset\n", "dataset = load_dataset(\"garage-bAInd/Open-Platypus\")\n", "dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BrLEFI8CZMlx", "outputId": "8d79d0b3-4d9d-4a56-d5d7-0f3de4fde4ae" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Dataset({\n", " features: ['input', 'output', 'instruction', 'data_source'],\n", " num_rows: 24926\n", "})" ] }, "metadata": {}, "execution_count": 4 } ], "source": [ "dataset[\"train\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "MVuiD_w5ZOiQ", "outputId": "70ef8d32-008f-4471-f9bf-691e24687d9e" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " input output \\\n", "0 To find the probability of the spinner landing... \n", "1 I need to choose 6 people out of 14, and the o... \n", "2 First we count the number of all 4-letter word... \n", "3 She can do this if and only if at least one of... \n", "4 Think of the problem as a sequence of H's and ... \n", "... ... ... \n", "24921 There are several formulas that generate all p... \n", "24922 Sure, here are some of the best universities:\\... \n", "24923 [Enter stage left]\\n\\nFirst Citizen: Good morr... \n", "24924 That's a very thoughtful idea! Here are some s... \n", "24925 Certainly! 
Here's an implementation of the Tim... \n", "\n", " instruction data_source \n", "0 A board game spinner is divided into three par... MATH/PRM-800K \n", "1 My school's math club has 6 boys and 8 girls. ... MATH/PRM-800K \n", "2 How many 4-letter words with at least one cons... MATH/PRM-800K \n", "3 Melinda will roll two standard six-sided dice ... MATH/PRM-800K \n", "4 Let $p$ be the probability that, in the proces... MATH/PRM-800K \n", "... ... ... \n", "24921 Can we find a formula to generate all prime nu... guanaco \n", "24922 What are some of the best university's for stu... guanaco \n", "24923 Write me a SPL (Shakespeare Programming Langua... guanaco \n", "24924 Hi. I want to make an embossed picture for my ... guanaco \n", "24925 Please implement the Timsort algorithm on Lean... guanaco \n", "\n", "[24926 rows x 4 columns]" ], "text/html": [ "\n", "
\n", " | input | \n", "output | \n", "instruction | \n", "data_source | \n", "
---|---|---|---|---|
0 | \n", "\n", " | To find the probability of the spinner landing... | \n", "A board game spinner is divided into three par... | \n", "MATH/PRM-800K | \n", "
1 | \n", "\n", " | I need to choose 6 people out of 14, and the o... | \n", "My school's math club has 6 boys and 8 girls. ... | \n", "MATH/PRM-800K | \n", "
2 | \n", "\n", " | First we count the number of all 4-letter word... | \n", "How many 4-letter words with at least one cons... | \n", "MATH/PRM-800K | \n", "
3 | \n", "\n", " | She can do this if and only if at least one of... | \n", "Melinda will roll two standard six-sided dice ... | \n", "MATH/PRM-800K | \n", "
4 | \n", "\n", " | Think of the problem as a sequence of H's and ... | \n", "Let $p$ be the probability that, in the proces... | \n", "MATH/PRM-800K | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
24921 | \n", "\n", " | There are several formulas that generate all p... | \n", "Can we find a formula to generate all prime nu... | \n", "guanaco | \n", "
24922 | \n", "\n", " | Sure, here are some of the best universities:\\... | \n", "What are some of the best university's for stu... | \n", "guanaco | \n", "
24923 | \n", "\n", " | [Enter stage left]\\n\\nFirst Citizen: Good morr... | \n", "Write me a SPL (Shakespeare Programming Langua... | \n", "guanaco | \n", "
24924 | \n", "\n", " | That's a very thoughtful idea! Here are some s... | \n", "Hi. I want to make an embossed picture for my ... | \n", "guanaco | \n", "
24925 | \n", "\n", " | Certainly! Here's an implementation of the Tim... | \n", "Please implement the Timsort algorithm on Lean... | \n", "guanaco | \n", "
24926 rows × 4 columns
\n", "\n", " | input | \n", "output | \n", "instruction | \n", "data_source | \n", "
---|---|---|---|---|
0 | \n", "\n", " | To find the probability of the spinner landing... | \n", "A board game spinner is divided into three par... | \n", "MATH/PRM-800K | \n", "
1 | \n", "\n", " | I need to choose 6 people out of 14, and the o... | \n", "My school's math club has 6 boys and 8 girls. ... | \n", "MATH/PRM-800K | \n", "
2 | \n", "\n", " | First we count the number of all 4-letter word... | \n", "How many 4-letter words with at least one cons... | \n", "MATH/PRM-800K | \n", "
3 | \n", "\n", " | She can do this if and only if at least one of... | \n", "Melinda will roll two standard six-sided dice ... | \n", "MATH/PRM-800K | \n", "
4 | \n", "\n", " | Think of the problem as a sequence of H's and ... | \n", "Let $p$ be the probability that, in the proces... | \n", "MATH/PRM-800K | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
24890 | \n", "\n", " | There are several formulas that generate all p... | \n", "Can we find a formula to generate all prime nu... | \n", "guanaco | \n", "
24891 | \n", "\n", " | Sure, here are some of the best universities:\\... | \n", "What are some of the best university's for stu... | \n", "guanaco | \n", "
24892 | \n", "\n", " | [Enter stage left]\\n\\nFirst Citizen: Good morr... | \n", "Write me a SPL (Shakespeare Programming Langua... | \n", "guanaco | \n", "
24893 | \n", "\n", " | That's a very thoughtful idea! Here are some s... | \n", "Hi. I want to make an embossed picture for my ... | \n", "guanaco | \n", "
24894 | \n", "\n", " | Certainly! Here's an implementation of the Tim... | \n", "Please implement the Timsort algorithm on Lean... | \n", "guanaco | \n", "
24895 rows × 4 columns
\n", "\n", " | input | \n", "output | \n", "instruction | \n", "data_source | \n", "
---|---|---|---|---|
0 | \n", "\n", " | To find the probability of the spinner landing... | \n", "A board game spinner is divided into three par... | \n", "MATH/PRM-800K | \n", "
1 | \n", "\n", " | I need to choose 6 people out of 14, and the o... | \n", "My school's math club has 6 boys and 8 girls. ... | \n", "MATH/PRM-800K | \n", "
2 | \n", "\n", " | First we count the number of all 4-letter word... | \n", "How many 4-letter words with at least one cons... | \n", "MATH/PRM-800K | \n", "
3 | \n", "\n", " | She can do this if and only if at least one of... | \n", "Melinda will roll two standard six-sided dice ... | \n", "MATH/PRM-800K | \n", "
4 | \n", "\n", " | Think of the problem as a sequence of H's and ... | \n", "Let $p$ be the probability that, in the proces... | \n", "MATH/PRM-800K | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
18183 | \n", "\n", " | There are several formulas that generate all p... | \n", "Can we find a formula to generate all prime nu... | \n", "guanaco | \n", "
18184 | \n", "\n", " | Sure, here are some of the best universities:\\... | \n", "What are some of the best university's for stu... | \n", "guanaco | \n", "
18185 | \n", "\n", " | [Enter stage left]\\n\\nFirst Citizen: Good morr... | \n", "Write me a SPL (Shakespeare Programming Langua... | \n", "guanaco | \n", "
18186 | \n", "\n", " | That's a very thoughtful idea! Here are some s... | \n", "Hi. I want to make an embossed picture for my ... | \n", "guanaco | \n", "
18187 | \n", "\n", " | Certainly! Here's an implementation of the Tim... | \n", "Please implement the Timsort algorithm on Lean... | \n", "guanaco | \n", "
18188 rows × 4 columns
\n", "\n", " | instruction | \n", "output | \n", "
---|---|---|
0 | \n", "Let's come up with a rich and complex alien wo... | \n", "Planet Name: Xylothar\\n\\nXylothar is a diverse... | \n", "
1 | \n", "Let\\n$$p(x,y) = a_0 + a_1x + a_2y + a_3x^2 + a... | \n", "Observe that \\begin{align*}\\np(0,0) &= a_0 = ... | \n", "
2 | \n", "Given the code below, refactor it, add comment... | \n", "Here is the refactored and commented version:\\... | \n", "
3 | \n", "Find the area of the region described by $x \\g... | \n", "Let $n = \\lfloor x \\rfloor,$ and let $\\{x\\} = ... | \n", "
4 | \n", "Let $P$ be the plane passing through the origi... | \n", "Let $\\mathbf{v} = \\begin{pmatrix} x \\\\ y \\\\ z ... | \n", "
... | \n", "... | \n", "... | \n", "
995 | \n", "Hello. My name is Mike. I have a message left ... | \n", "Hello Mike, it's nice to meet you. As an AI la... | \n", "
996 | \n", "Given a prime $p$ and an integer $a$, we say t... | \n", "To find the primitive roots $\\pmod 7$, I need ... | \n", "
997 | \n", "Let $f$ be defined by \\[f(x) = \\left\\{\\n\\begi... | \n", "The number $f^{-1}(-3)$ is the value of $x$ su... | \n", "
998 | \n", "BEGININPUT\\nBEGINCONTEXT\\ndate: August 5, 2021... | \n", "Dr. Eleanor Thompson's study found that partic... | \n", "
999 | \n", "BEGININPUT\\nBEGINCONTEXT\\ndate: August 14, 201... | \n", "Charles Leroux won the Golden Bread Award in 1... | \n", "
1000 rows × 2 columns
\n", "\n", " | instruction | \n", "output | \n", "
---|---|---|
0 | \n", "Let's come up with a rich and complex alien wo... | \n", "Planet Name: Xylothar\\n\\nXylothar is a diverse... | \n", "
1 | \n", "Let\\n$$p(x,y) = a_0 + a_1x + a_2y + a_3x^2 + a... | \n", "Observe that \\begin{align*}\\np(0,0) &= a_0 = ... | \n", "
2 | \n", "Given the code below, refactor it, add comment... | \n", "Here is the refactored and commented version:\\... | \n", "
3 | \n", "Find the area of the region described by $x \\g... | \n", "Let $n = \\lfloor x \\rfloor,$ and let $\\{x\\} = ... | \n", "
4 | \n", "Let $P$ be the plane passing through the origi... | \n", "Let $\\mathbf{v} = \\begin{pmatrix} x \\\\ y \\\\ z ... | \n", "
... | \n", "... | \n", "... | \n", "
995 | \n", "Hello. My name is Mike. I have a message left ... | \n", "Hello Mike, it's nice to meet you. As an AI la... | \n", "
996 | \n", "Given a prime $p$ and an integer $a$, we say t... | \n", "To find the primitive roots $\\pmod 7$, I need ... | \n", "
997 | \n", "Let $f$ be defined by \\[f(x) = \\left\\{\\n\\begi... | \n", "The number $f^{-1}(-3)$ is the value of $x$ su... | \n", "
998 | \n", "BEGININPUT\\nBEGINCONTEXT\\ndate: August 5, 2021... | \n", "Dr. Eleanor Thompson's study found that partic... | \n", "
999 | \n", "BEGININPUT\\nBEGINCONTEXT\\ndate: August 14, 201... | \n", "Charles Leroux won the Golden Bread Award in 1... | \n", "
1000 rows × 2 columns
\n", "\n", " | instruction | \n", "output | \n", "
---|---|---|
0 | \n", "### Instruction:\\nLet's come up with a rich an... | \n", "Planet Name: Xylothar\\n\\nXylothar is a diverse... | \n", "
1 | \n", "### Instruction:\\nLet\\n$$p(x,y) = a_0 + a_1x +... | \n", "Observe that \\begin{align*}\\np(0,0) &= a_0 = ... | \n", "
2 | \n", "### Instruction:\\nGiven the code below, refact... | \n", "Here is the refactored and commented version:\\... | \n", "
3 | \n", "### Instruction:\\nFind the area of the region ... | \n", "Let $n = \\lfloor x \\rfloor,$ and let $\\{x\\} = ... | \n", "
4 | \n", "### Instruction:\\nLet $P$ be the plane passing... | \n", "Let $\\mathbf{v} = \\begin{pmatrix} x \\\\ y \\\\ z ... | \n", "
... | \n", "... | \n", "... | \n", "
995 | \n", "### Instruction:\\nHello. My name is Mike. I ha... | \n", "Hello Mike, it's nice to meet you. As an AI la... | \n", "
996 | \n", "### Instruction:\\nGiven a prime $p$ and an int... | \n", "To find the primitive roots $\\pmod 7$, I need ... | \n", "
997 | \n", "### Instruction:\\nLet $f$ be defined by \\[f(x... | \n", "The number $f^{-1}(-3)$ is the value of $x$ su... | \n", "
998 | \n", "### Instruction:\\nBEGININPUT\\nBEGINCONTEXT\\nda... | \n", "Dr. Eleanor Thompson's study found that partic... | \n", "
999 | \n", "### Instruction:\\nBEGININPUT\\nBEGINCONTEXT\\nda... | \n", "Charles Leroux won the Golden Bread Award in 1... | \n", "
1000 rows × 2 columns
\n", "