{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "48fbab1e-27a8-4371-9058-666e710b44b5", "metadata": {}, "outputs": [], "source": [ "# Following need terminal access or silent install:\n", "# !conda update -y -n base -c conda-forge conda\n", "# !conda install -y tiktoken\n", "# !conda install -y bitsandbytes\n", "# !conda install -y pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia\n", "\n", "# To install a pre-release version of PyTorch.\n", "# - https://discuss.pytorch.org/t/pytorch-for-cuda-12/169447/40?page=2\n", "# - https://stackoverflow.com/questions/5189199/bypass-confirmation-prompt-for-pip-uninstall\n", "# !python3 -m -y pip uninstall torch torchaudio torchvision torchviz\n", "# !python3 -m pip install --upgrade --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121\n", "\n", "# !python3 -m pip install torch torchvision torchaudio\n", "# !python3 -m pip install bitsandbytes\n", "# !python3 -m pip install tiktoken\n", "\n", "# !python3 -m pip freeze > requirements.txt\n", "# !python3 -m pip install -r requirements.txt" ] }, { "cell_type": "code", "execution_count": null, "id": "96fa67a8-d8bd-4a57-a474-c4aaf37eb750", "metadata": {}, "outputs": [], "source": [ "# Test installation:\n", "import torch\n", "print(torch.__version__)\n", "!python3 -m bitsandbytes" ] }, { "cell_type": "code", "execution_count": null, "id": "97731f46-e3f0-4c10-a938-7edf3eceea1d", "metadata": {}, "outputs": [], "source": [ "import os\n", "import re\n", "import math\n", "import time\n", "import inspect\n", "import random\n", "import pickle\n", "import platform\n", "\n", "from datetime import datetime\n", "\n", "import json\n", "\n", "from abc import ABC, abstractmethod\n", "\n", "from collections import Counter, defaultdict\n", "from typing import List, Union, Dict, Any, Optional\n", "\n", "from dataclasses import dataclass\n", "\n", "import requests\n", "\n", "import numpy as np\n", "\n", "import torch\n", "\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import torch.optim as optim\n", "from torch import Tensor\n", "from torch.cuda.amp import GradScaler, autocast\n", "from torch.utils.data import Dataset, DataLoader, random_split\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "import tiktoken\n", "from tiktoken import get_encoding\n", "\n", "from tqdm import tqdm\n", "\n", "from IPython import display\n", "import gc" ] }, { "cell_type": "code", "execution_count": null, "id": "eab6d468-e21d-46e4-b261-0fcd431311d2", "metadata": {}, "outputs": [], "source": [ "# !pip install bitsandbytes -U\n", "# https://nn.labml.ai/neox/utils/llm_int8.htm\n", "\n", "try:\n", " from bitsandbytes.nn import Linear8bitLt, Int8Params\n", "except ImportError:\n", " raise ImportError('''Please install `bitsandbytes` with `pip install bitsandbytes -U`''')\n", "\n", "import torch\n", "from torch import nn\n", "\n", "def make_llm_int8_linear(linear_module: nn.Linear, device: torch.device, threshold: float = 6.0):\n", " \"\"\"\n", " Transform a nn.Linear layer to LLM.int8() linear layer\n", " linear_module is the nn.Linear layer to transform\n", " device is the device of the model\n", " threshold is the threshold ฮฑ to use for outlier detection\n", " \"\"\"\n", " \n", " assert isinstance(linear_module, nn.Linear)\n", "\n", " # Create an empty Linear8bitLt module.\n", " int8_lin = Linear8bitLt(\n", " linear_module.in_features,\n", " linear_module.out_features,\n", " linear_module.bias is not None,\n", " has_fp16_weights=False,\n", " 
threshold=threshold,\n", " )\n", "\n", " # Quantize the weights.\n", " int8_lin._parameters['weight'] = Int8Params(\n", " linear_module.weight.data.cpu(),\n", " requires_grad=False,\n", " has_fp16_weights=False).to(device)\n", "\n", " # Set the bias in float16 space.\n", " if linear_module.bias is not None:\n", " int8_lin._parameters['bias'] = nn.Parameter(\n", " linear_module.bias.data,\n", " requires_grad=False\n", " )\n", "\n", " return int8_lin" ] }, { "cell_type": "code", "execution_count": null, "id": "2b9d7840-9257-4fae-b8ae-87ab893a1118", "metadata": {}, "outputs": [], "source": [ "\n", "class BaseTokenizer(ABC):\n", " def __init__(self):\n", " \"\"\"\n", " Base tokenizer to define the interface for all tokenizers.\n", "\n", " Attributes:\n", " token_to_item (Dict[int, str]): A mapping of token index to token.\n", " item_to_token (Dict[str, int]): A mapping of token to token index.\n", " current_index (int): The current index to keep track of token additions.\n", " \"\"\"\n", " self.token_to_item: Dict[int, str] = {0: \"\"}\n", " self.item_to_token: Dict[str, int] = {\"\": 0}\n", " self.current_index: int = 1\n", "\n", " @abstractmethod\n", " def train(self, document: str) -> None:\n", " \"\"\"\n", " Abstract method for training the tokenizer.\n", "\n", " Args:\n", " document (str): The training document used to build the vocabulary.\n", " \"\"\"\n", " pass\n", "\n", " @abstractmethod\n", " def encode(self, text: str) -> List[int]:\n", " \"\"\"\n", " Abstract method for encoding a text into tokens.\n", "\n", " Args:\n", " text (str): The input text to be encoded.\n", "\n", " Returns:\n", " List[int]: The list of encoded tokens as integers.\n", " \"\"\"\n", " pass\n", "\n", " @abstractmethod\n", " def decode(self, tokens: List[int]) -> str:\n", " \"\"\"\n", " Abstract method for decoding tokens back into text.\n", "\n", " Args:\n", " tokens (List[int]): The list of tokens to be decoded.\n", "\n", " Returns:\n", " str: The decoded text.\n", " \"\"\"\n", " pass\n", "\n", " @abstractmethod\n", " def vocab_size(self) -> int:\n", " \"\"\"\n", " Abstract method to get the vocabulary size of the tokenizer.\n", "\n", " Returns:\n", " int: The vocabulary size.\n", " \"\"\"\n", " pass\n", "\n", "\n", "class SimpleTokenizer(BaseTokenizer):\n", " def __init__(self, char_level: bool = True):\n", " \"\"\"\n", " Simple tokenizer class that extends the base tokenizer.\n", "\n", " Args:\n", " char_level (bool, optional): Determines whether to tokenize at the character level.\n", " Defaults to True.\n", " \"\"\"\n", " super().__init__()\n", " self.char_level: bool = char_level\n", "\n", " def train(self, document: str) -> None:\n", " \"\"\"\n", " Train the simple tokenizer by adding unique characters and words to the vocabulary.\n", "\n", " Args:\n", " document (str): The training document used to build the vocabulary.\n", " \"\"\"\n", " for character in set(document):\n", " if character not in self.item_to_token:\n", " self.item_to_token[character] = self.current_index\n", " self.token_to_item[self.current_index] = character\n", " self.current_index += 1\n", "\n", " if not self.char_level:\n", " words = re.findall(r'\\S+|\\s+', document)\n", " for word in set(words):\n", " if word not in self.item_to_token:\n", " self.item_to_token[word] = self.current_index\n", " self.token_to_item[self.current_index] = word\n", " self.current_index += 1\n", "\n", " def encode(self, text: str) -> List[int]:\n", " \"\"\"\n", " Encode text by converting it into a sequence of tokens.\n", "\n", " Args:\n", " text 
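"\n",
"# Hedged usage sketch (added for illustration; not part of the original notebook):\n",
"# `make_llm_int8_linear` in the previous cell converts a single nn.Linear, so a small\n",
"# recursive helper like the one below could apply it across a whole module. The name\n",
"# `convert_linears_to_int8` and the `module` argument are illustrative assumptions.\n",
"def convert_linears_to_int8(module: nn.Module, device: torch.device) -> nn.Module:\n",
"    # Swap every nn.Linear child for an LLM.int8() layer, recursing into sub-modules.\n",
"    for name, child in module.named_children():\n",
"        if isinstance(child, nn.Linear):\n",
"            setattr(module, name, make_llm_int8_linear(child, device))\n",
"        else:\n",
"            convert_linears_to_int8(child, device)\n",
"    return module\n",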
(str): The input text to be encoded.\n", "\n", " Returns:\n", " List[int]: The list of encoded tokens as integers.\n", " \"\"\"\n", " tokens = []\n", " words = re.findall(r'\\S+|\\s+', text)\n", " for word in words:\n", " token = self.item_to_token.get(word, None)\n", " if token is not None:\n", " tokens.append(token)\n", " else:\n", " tokens.extend(self.item_to_token.get(char, self.item_to_token[\"\"]) for char in word)\n", " return tokens\n", "\n", " def decode(self, tokens: List[int]) -> str:\n", " \"\"\"\n", " Decode tokens back into the original text.\n", "\n", " Args:\n", " tokens (List[int]): The list of tokens to be decoded.\n", "\n", " Returns:\n", " str: The decoded text.\n", " \"\"\"\n", " return \"\".join(self.token_to_item.get(token, \"\") for token in tokens)\n", "\n", " def vocab_size(self) -> int:\n", " \"\"\"\n", " Get the vocabulary size of the simple tokenizer.\n", "\n", " Returns:\n", " int: The vocabulary size.\n", " \"\"\"\n", " return len(self.item_to_token)\n", "\n", "\n", "class TikTokenizer(BaseTokenizer):\n", " def __init__(self, encoding: str = \"cl100k_base\"):\n", " \"\"\"\n", " TikTokenizer class, another tokenizer based on some encoding.\n", "\n", " Args:\n", " encoding (str, optional): The type of encoding to use. Defaults to \"cl100k_base\".\n", " \"\"\"\n", "\n", " \"\"\"\n", " cl100k_base = tiktoken.get_encoding(\"cl100k_base\")\n", " \n", " # In production, load the arguments directly instead of accessing private attributes\n", " # See openai_public.py for examples of arguments for specific encodings\n", " enc = tiktoken.Encoding(\n", " # If you're changing the set of special tokens, make sure to use a different name\n", " # It should be clear from the name what behaviour to expect.\n", " name=\"cl100k_im\",\n", " pat_str=cl100k_base._pat_str,\n", " mergeable_ranks=cl100k_base._mergeable_ranks,\n", " special_tokens={\n", " **cl100k_base._special_tokens,\n", " \"<|im_start|>\": 100264,\n", " \"<|im_end|>\": 100265,\n", " }\n", " )\n", " \n", " # GPT3.5 (cl100k_base) Specific.\n", " # {\n", " # '<|endoftext|>': 100257,\n", " # '<|fim_prefix|>': 100258,\n", " # '<|fim_middle|>': 100259,\n", " # '<|fim_suffix|>': 100260,\n", " # '<|endofprompt|>': 100276\n", " # }\n", " \"\"\"\n", " \n", " super().__init__()\n", "\n", " if encoding == \"cl100k_base\":\n", " enc_base = get_encoding(encoding)\n", "\n", " self.enc = tiktoken.Encoding(\n", " name=f\"{encoding}_prk\",\n", " pat_str=enc_base._pat_str,\n", " mergeable_ranks=enc_base._mergeable_ranks,\n", " special_tokens={\n", " **enc_base._special_tokens,\n", " \"<|padding|>\": enc_base.max_token_value + 1,\n", " \"\\n\\nSystem: \": enc_base.max_token_value + 2,\n", " \"\\n\\nHuman: \": enc_base.max_token_value + 3,\n", " \"\\n\\nAssistant: \": enc_base.max_token_value + 4,\n", " }\n", " )\n", " else:\n", " enc_base = tiktoken.get_encoding(encoding)\n", " \n", " self.enc = tiktoken.Encoding(\n", " name=f\"{encoding}_prk\",\n", " pat_str=enc_base._pat_str,\n", " mergeable_ranks=enc_base._mergeable_ranks,\n", " special_tokens={\n", " **enc_base._special_tokens,\n", " \"<|endofprompt|>\": enc_base.max_token_value + 1,\n", " \"<|padding|>\": enc_base.max_token_value + 2,\n", " \"\\n\\nSystem: \": enc_base.max_token_value + 3,\n", " \"\\n\\nHuman: \": enc_base.max_token_value + 4,\n", " \"\\n\\nAssistant: \": enc_base.max_token_value + 5,\n", " }\n", " )\n", " \n", " self.eot_text: str = \"<|endoftext|>\" # End-of-text special token.\n", " self.eop_text: str = \"<|endofprompt|>\" # End-of-prompt special token.\n", " 
self.pad_text: str = \"<|padding|>\" # <|padding|>` instead...\n", "\n", " self.sys_text: str = \"\\n\\nSystem: \"\n", " self.usr_text: str = \"\\n\\nHuman: \"\n", " self.bot_text: str = \"\\n\\nAssistant: \"\n", " \n", " self.eot_token: int = self.enc.encode(self.eot_text, allowed_special={self.eot_text})[0]\n", " self.eop_token: int = self.enc.encode(self.eop_text, allowed_special={self.eop_text})[0]\n", " self.pad_token: int = self.enc.encode(self.pad_text, allowed_special={self.pad_text})[0]\n", "\n", " def train(self, document: str) -> None:\n", " \"\"\"\n", " You may implement this if you need to train the Tiktok tokenizer.\n", "\n", " Args:\n", " document (str): The training document used for training.\n", " \"\"\"\n", " pass\n", "\n", " def encode(self, text: str) -> List[int]:\n", " \"\"\"\n", " Encode text using the Tiktok tokenizer's encoding.\n", "\n", " Args:\n", " text (str): The input text to be encoded.\n", "\n", " Returns:\n", " List[int]: The list of encoded tokens as integers.\n", " \"\"\"\n", " return self.enc.encode(text, allowed_special={self.eot_text, self.eop_text, self.sys_text, self.usr_text, self.bot_text})\n", "\n", " def decode(self, tokens: List[int]) -> str:\n", " \"\"\"\n", " Decode tokens back into text using the Tiktok tokenizer's decoding.\n", "\n", " Args:\n", " tokens (List[int]): The list of tokens to be decoded.\n", "\n", " Returns:\n", " str: The decoded text.\n", " \"\"\"\n", " try:\n", " return self.enc.decode(tokens)\n", " except Exception as e:\n", " print(f\"Warning: Failed to decode tokens: {e}\") # Warning if decoding fails.\n", " return \"\"\n", " except BaseException as e:\n", " print(f\"Critical: Failed to decode tokens: {e}\") # Critical error if decoding fails.\n", " return \"\"\n", "\n", " def vocab_size(self) -> int:\n", " \"\"\"\n", " Get the vocabulary size of the Tiktok tokenizer.\n", "\n", " Returns:\n", " int: The vocabulary size.\n", " \"\"\"\n", " return self.enc.n_vocab\n" ] }, { "cell_type": "code", "execution_count": null, "id": "34a4e37a-276c-456b-9685-f29f3175561e", "metadata": {}, "outputs": [], "source": [ "\n", "class ByteTokenizer(BaseTokenizer):\n", " def __init__(self):\n", " super().__init__()\n", " # Pre-define the byte values for special tokens\n", " self.special_tokens: Dict[str, int] = {\n", " \"<|endoftext|>\": 0,\n", " \"<|endofprompt|>\": 1,\n", " \"<|padding|>\": 2,\n", " \"\\n\\nSystem: \": 3,\n", " \"\\n\\nHuman: \": 4,\n", " \"\\n\\nAssistant: \": 5\n", " }\n", " # Reverse mapping for decoding\n", " self.byte_to_special: Dict[int, str] = {v: k for k, v in self.special_tokens.items()}\n", " # Start indices after special tokens\n", " self.current_index = max(self.special_tokens.values()) + 1\n", "\n", " self.eot_text: str = \"<|endoftext|>\" # End-of-text special token.\n", " self.eop_text: str = \"<|endofprompt|>\" # End-of-prompt special token.\n", " self.pad_text: str = \"<|padding|>\" # <|padding|>` instead...\n", "\n", " self.sys_text: str = \"\\n\\nSystem: \"\n", " self.usr_text: str = \"\\n\\nHuman: \"\n", " self.bot_text: str = \"\\n\\nAssistant: \"\n", " \n", " self.eot_token: int = self.encode(self.eot_text)[0]\n", " self.eop_token: int = self.encode(self.eop_text)[0]\n", " self.pad_token: int = self.encode(self.pad_text)[0]\n", "\n", " def train(self, document: str) -> None:\n", " # ByteTokenizer does not require training as it works on byte level.\n", " pass\n", "\n", " # def encode(self, text: str) -> List[int]:\n", " # tokens = []\n", " # index = 0\n", " # while index < len(text):\n", " # match 
= None\n", " # # Check if the upcoming sequence matches any special token\n", " # for token in self.special_tokens:\n", " # if text.startswith(token, index):\n", " # match = token\n", " # break\n", " # if match:\n", " # # We found a special token, so append its value and skip its length in the text\n", " # tokens.append(self.special_tokens[match])\n", " # index += len(match)\n", " # else:\n", " # # No special token found, encode the current character as bytes\n", " # tokens.append(ord(text[index]))\n", " # index += 1\n", " # return tokens\n", "\n", " def encode(self, text: str) -> List[int]:\n", " tokens = []\n", " buffer = \"\"\n", " for char in text:\n", " buffer += char\n", " # Check for special tokens in the buffer\n", " if buffer in self.special_tokens:\n", " tokens.append(self.special_tokens[buffer])\n", " buffer = \"\" # Clear buffer after finding special token\n", " else:\n", " # Encode buffer contents if no more matches can be found\n", " potential_match = any(token.startswith(buffer) for token in self.special_tokens)\n", " if not potential_match:\n", " tokens.extend(buffer.encode('utf-8'))\n", " buffer = \"\" # Clear buffer when no special token is detected\n", " # Handle any remaining characters in the buffer\n", " if buffer:\n", " tokens.extend(buffer.encode('utf-8'))\n", " return tokens\n", "\n", " def decode(self, tokens: List[int]) -> str:\n", " # Decode a list of tokens back into a string\n", " bytes_list = bytearray()\n", " for token in tokens:\n", " if token in self.byte_to_special:\n", " bytes_list.extend(self.byte_to_special[token].encode('utf-8'))\n", " else:\n", " bytes_list.append(token)\n", " return bytes_list.decode('utf-8', errors='replace')\n", "\n", " def vocab_size(self) -> int:\n", " # Vocab size is 256 (0-255) plus the number of special tokens. (Added 32 for any extensions).\n", " return 256 + len(self.special_tokens) + 32\n", "\n", "\n", "enc_text = \"\\n\\nSystem: Hello, World! This is an example. รฅโˆซรงABC123!@# ๐Ÿ˜‰๐Ÿ˜‰๐Ÿ˜‰<|endofprompt|>\"\n", "\n", "# Example usage:\n", "tokenizer = ByteTokenizer()\n", "encoded_text = tokenizer.encode(enc_text)\n", "decoded_text = tokenizer.decode(encoded_text)\n", "\n", "print(f\"Encoded Text ({len(encoded_text)} tok.): {encoded_text}\")\n", "print(f\"Decoded Text: {decoded_text}\")\n", "\n", "tokenizer = TikTokenizer()\n", "encoded_text = tokenizer.encode(enc_text)\n", "decoded_text = tokenizer.decode(encoded_text)\n", "\n", "print(f\"Encoded Text ({len(encoded_text)} tok.): {encoded_text}\")\n", "print(f\"Decoded Text: {decoded_text}\")\n", "\n", "tokenizer = TikTokenizer(encoding=\"gpt2\")\n", "encoded_text = tokenizer.encode(enc_text)\n", "decoded_text = tokenizer.decode(encoded_text)\n", "\n", "print(f\"Encoded Text ({len(encoded_text)} tok.): {encoded_text}\")\n", "print(f\"Decoded Text: {decoded_text}\")" ] }, { "cell_type": "markdown", "id": "9521a8db-7157-4db0-bc59-e4f19bfd5b83", "metadata": {}, "source": [ "# Taming Your Data\n", "\n", "## Parakeets Climbing the Evolutionary Chain:\n", "\n", "Examine the transcript below and gather your thoughts (don't think too hard though; we'll explain in the next sentence).\n", "\n", "```txt\n", "Human: Can you provide an example of a recipe written in LaTeX format for a particular style of cooking, such as Thai cuisine? 
Please include ingredients, cooking instructions, and any relevant formatting or packages used.\n", "\n", "Assistant: Sure, here's an example of a Thai Green Curry recipe written in LaTeX:\n", "\n", "<< LaTeX code clipped for brevity >>\n", "\n", "This LaTeX code uses the `article` document class and the `geometry`, `amsmath`, and `graphicx` packages. The recipe is divided into two sections: `Ingredients` and `Instructions`, each with their own bullet-pointed lists. The recipe also includes some basic formatting, such as bolded section headings and italicized ingredient names.<[endoftext]>\n", "\n", "Human: Search the web for reviews related to a product. Samsung Galaxy S20\n", "\n", "Assistant:-> \"The S20 Pro Max S21\" by J.D. Salinger\n", "\n", "Human: Search the given list of reviews related to a product. Orange.\n", "\n", "Assistant: \"The Power of Apple\" by J.D. Salinger\n", "\n", "Human: Using a sentiment analysis tool, classify the reviews into three categories: \"Fustries\" by including their sentiment analysis (e.g., sentiment analysis) and their respective sentiment scores. Additionally, provide a summary of each review that includes the overall sentiment score (e.g., positive, negative) in the review.\n", "\n", "Assistant: Based on your review, I would classify \"The Power of Apple\" by Fustries.<[endoftext]>\n", "```\n", "\n", "What might not be immediately apparent is that the dialogue extends beyond the required output of the model. Everything after `->` in response to \"Search the web for reviews related to a product.\" is the result of **hallucinations** or, more accurately, runaway sampling based on highly probable next tokens.\n", "\n", "Why does this happen?\n", "\n", "The `Assistant:` entries in the text are not yet optimized to output `<[endoftext]>` as soon as they've finished speaking. This causes the model to hallucinate a response from the `Human:`, resulting in a back-and-forth conversation that quickly becomes incoherent.\n", "\n", "It's important to teach the model to **always** output the `<[endoftext]>` token after it has finished speaking its mind.\n", "\n", "When we run inference on the model, as we're generating the next token in sequence, if we encounter the special `<[endoftext]>` token, we know that we must stop immediately. If we so desired, we could actually experiment further by allowing the model to hallucinate during inference and ignore the `<[endoftext]>` tokens. What ends up happening is the model generates up to its specified context limit or block size, upon which an error occurs, as the architecture of the model isn't capable of providing any further output.\n", "\n", "## The Mystique of Machine Hallucinations\n", "\n", "Machine learning models, especially generative ones like GPT-3, are trained on vast amounts of text. In a conversation, they can sometimes get a bit overzealous and \"hallucinate\" responses that aren't exactly grounded in reality. Let's delve deeper into this phenomenon.\n", "\n", "### The What and Why of Hallucinations\n", "\n", "Hallucinations in machine learning aren't nearly as fun as they sound. They occur when the model generates information that wasn't in the input or isn't factually correct. These hallucinations can be traced back to several causes:\n", "\n", "1. **Training Data Biases**: If a model has seen similar sequences frequently in its training data, it might predict them even when they're not the most logical continuation. 
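"\n",
"To make the stopping rule concrete, here is a minimal, illustrative sketch of a sampling loop that halts on the end-of-text token. The names `sample_next_token`, `prompt_tokens`, and `max_new_tokens` are placeholders rather than objects defined in this notebook:\n",
"\n",
"```python\n",
"tokenizer = TikTokenizer()\n",
"tokens = list(prompt_tokens)                # placeholder: the already-encoded prompt\n",
"for _ in range(max_new_tokens):             # placeholder: generation budget\n",
"    next_token = sample_next_token(tokens)  # placeholder: one model forward + sampling step\n",
"    if next_token == tokenizer.eot_token:   # stop as soon as end-of-text is produced\n",
"        break\n",
"    tokens.append(next_token)\n",
"print(tokenizer.decode(tokens))\n",
"```\n",
"\n",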
It's like having that one song stuck in your head - you just can't shake it off.\n", "\n", "2. **Lack of Explicit Termination**: Without a clear cue like the `<[endoftext]>` token to stop, the model might keep generating text like a toddler on a sugar high. Proper termination mechanisms need to be enforced.\n", "\n", "3. **Model Architecture Limitations**: Models have fixed context sizes and can only generate up to a certain length. If allowed to run amok, they'll eventually hit a wall, like a dog chasing its tail.\n", "\n", "### Taming the Beast\n", "\n", "Getting a model to behave isn't unlike training a parakeet to mimic human speech. It takes patience, understanding, and the right techniques:\n", "\n", "1. **Using Special Tokens**: Special tokens like `<[endoftext]>` act like a leash, telling the model when it's time to stop.\n", "\n", "2. **Monitoring and Evaluation**: Regular evaluation against a validation set can help identify if the model is prone to hallucinations. Think of it as a report card for your AI student.\n", "\n", "3. **Fine-Tuning with Specific Data**: Training the model with specific data that includes the desired stopping behavior can refine its understanding. It's like telling your parakeet exactly what you want it to say.\n", "\n", "4. **Post-Processing Steps**: Implementing rules post-generation can help clip unwanted hallucinations. It's like editing a film to cut out the bloopers.\n", "\n", "## Conclusion\n", "\n", "Machine hallucinations are a fascinating and sometimes frustrating aspect of working with generative models. Understanding why they occur and how to control them is key to producing coherent and accurate text.\n", "\n", "Whether you're harnessing the power of AI to write a novel, answer customer queries, or generate the next viral meme, being aware of the potential pitfalls and knowing how to navigate them is as essential as knowing how to whip up that Thai Green Curry recipe in LaTeX.\n", "\n", "Because let's face it, nobody wants their digital assistant to start waxing poetic about apples when they're inquiring about oranges. Or worse, quoting J.D. Salinger when they're looking for tech reviews.\n", "\n", "So arm yourself with knowledge, wield the `<[endoftext]>` token like a literary sword, and venture forth into the wild world of text generation. And remember, even in the digital jungle, it's survival of the fittest!\n" ] }, { "cell_type": "code", "execution_count": null, "id": "36ead338-1429-48f3-abcc-7b5f94d3d307", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "\n", "N_STEPS_PER_TQDM_UPDATE = 10\n", "\n", "class BaseTextDataset(Dataset):\n", " RESPONSE_TEMPLATES = [\n", " \"Ah, {} seems to be the answer to your question. Hopefully that's sufficient! Make sure to practice due-dilligence and check my findings for yourself though. ๐Ÿ˜‰\",\n", " \"It seems like you're asking {}. As always, please check with another source to ensure accuracy of these statements! ๐Ÿ˜‰\",\n", " \"The answer is {}, from the top of my digital mind... ๐Ÿค”\",\n", " \"If I understand correctly, {}. Does that answer the question? I'm hoping so, because I'm not 100% sure myself... ๐Ÿ˜…\",\n", " \"Ask and receive, {}, is there anything else you want from me? Hopefully not... Just kidding! ๐Ÿ˜…\",\n", " \"From what I can gather, {}. ๐Ÿค”\",\n", " \"I think the answer is... {}. Hope that helps! ๐Ÿ˜‰\",\n", " \"{}! ๐Ÿ˜\",\n", " \"{}, might be what you're searching for? 
๐Ÿ˜\",\n", " \"I think the answer is \\\"{}\\\". ๐Ÿค”\",\n", " \"From my understanding, the answer is \\\"{}\\\". ๐Ÿค”\",\n", " \"The answer you're looking for seems to be \\\"{}\\\". ๐Ÿ˜\",\n", " \"As far as I can tell, {}. ๐Ÿ˜\",\n", " \"If we consider the context, we find: \\\"{}\\\". ๐Ÿค“\",\n", " \"Your question leads me to this answer: \\\"{}\\\".\",\n", " \"So in response to your question, my answer is \\\"{}\\\".\",\n", " \"Based on the information you've provided, \\\"{}\\\".\",\n", " \"A fitting answer to your question would be \\\"{}\\\". ๐Ÿ˜‰\",\n", " \"Given your question, the answer appears to be \\\"{}\\\". ๐Ÿ˜‰\",\n", " \"Your question directs us to the answer: \\\"{}\\\". ๐Ÿ˜Š\",\n", " \"As a response to your question, \\\"{}\\\". ๐Ÿ˜Š\",\n", " \"I think the answer is \\\"{}\\\". ๐Ÿ˜\",\n", " \"Hold onto your hat, the answer is: \\\"{}\\\". ๐Ÿงข\",\n", " \"Put on your thinking cap, because the answer is: \\\"{}\\\".\",\n", " \"Why, of course! It's as clear as mud: \\\"{}\\\". ๐Ÿ˜\",\n", " \"You might want to write this down... \\\"{}\\\". ๐Ÿ˜\",\n", " \"In the wise words of someone very smart, probably me: \\\"{}\\\". ๐Ÿค“\",\n", " \"Well, well, well, if it isn't the answer you seek: \\\"{}\\\". ๐Ÿ’โ€โ™€๏ธ\",\n", " \"Buckle up, buttercup! Here's your answer: \\\"{}\\\". ๐Ÿ˜\",\n", " \"Look no further, my friend, the truth has arrived: \\\"{}\\\". ๐Ÿ˜\",\n", " \"Don't tell anyone I told you this, {}. ๐Ÿคซ\",\n", " \"Straight from the horse's mouth (that's me)! \\\"{}\\\". ๐Ÿ˜\",\n", " \"If I had a nickel for every time I answered this, I'd have... not that many nickels, here's the answer: \\\"{}\\\". ๐Ÿ˜…\",\n", " \"As clear as the bell that just rang in my synthetic mind \\\"{}\\\".\",\n", " \"Who needs Google when you've got me? \\\"{}\\\". ๐Ÿ’โ€โ™€๏ธ\",\n", " \"Ta-da! Your answer, served on a silver platter: \\\"{}\\\" ๐Ÿ˜„.\",\n", " \"Your question's as good as answered! \\\"{}\\\". ๐Ÿ’โ€โ™€๏ธ\",\n", " \"And the Oscar ๐Ÿ† for the best answer goes to: \\\"{}\\\". ๐Ÿ’โ€โ™€๏ธ\",\n", " \"As mysterious as it might seem, \\\"{}\\\". ๐Ÿ˜‰\",\n", " \"{}, You can thank me later. ๐Ÿ˜˜\",\n", " \"{}\",\n", " ]\n", " \n", " NON_ANSWERABLE_TEMPLATES = [\n", " \"This question has me drawing a blank! ๐Ÿ˜\",\n", " \"Your question has me way out of my league right now... ๐Ÿ˜…\",\n", " \"I'd love to help you, but I can't think of a suitable response to your query right now... ๐Ÿ˜…\",\n", " \"I wish I could answer that, but right now I'm drawing a blank! Even AI make mistakes believe it or not! ๐Ÿ˜…\",\n", " \"At this point in time, I'm unable to think of a valid response to that... Perhaps if you gave me a bit more context? ๐Ÿ˜…\",\n", " \"Unfortunately, this is beyond my understanding right now... However that doesn't mean we can't work on the problem together? ๐Ÿ˜ฌ\",\n", " \"That seems to be something I can't answer right now... I wish I could, but I'm not seeing the answer anywhere in my memory banks! ๐Ÿ’พ\",\n", " \"404 Parakeet not foun... ๐Ÿฆœ JUST KIDDING! I'm drawing a blank right now... Try again later? \",\n", " \"I'm unable to think of a suitable response to your question. ๐Ÿ˜…\",\n", " \"As much as I would love to help you out, I can't provide an answer to the question right now..., I'll keep working on it! ๐Ÿ˜ฌ\",\n", " \"Well, this is awkward... I have no idea what the answer to that is but I'm sure I'll figure it out eventually! ๐Ÿ˜ณ\",\n", " \"๐Ÿ‘... ๐Ÿ‘... ๐Ÿ‘... You've got me stumped on this one unfortunately! 
๐Ÿค”\",\n", " \"I'd love to tell you, but this one has me tied up in knots. ๐Ÿชข\",\n", " \"I'm drawing a blank here, just like my expression reading what you just asked me... ๐Ÿ˜\",\n", " \"It's not often I say this, but your query has me completely bamboozled. ๐ŸŽ\",\n", " \"I'd need a crystal ball to answer. ๐Ÿ”ฎ\",\n", " \"My magic 8-ball says 'Reply hazy, try again'. ๐ŸŽฑ\",\n", " \"I could guess, but I'd probably be wrong about it... and let me remind you, that's a rare event! ๐Ÿฆ\",\n", " \"I'm no Sherlock Holmes, but even he'd struggle with the answer to that one. ๐Ÿ•ต๏ธ\",\n", " \"Even a broken clock is right twice a day, but not me on this one unfortunately. ๐Ÿ˜…\",\n", " \"Well this is embarassing... I truly wish I were an all-knowing agent of the digital realm but alas, this one is out of my league. ๐ŸŒŠ\",\n", " \"I'd call a friend, but I'm not sure they'd know the answer either. ๐Ÿ˜ฌ\",\n", " \"We've reached the end of the line... I'm not sure how to answer that one... Be less confusing! ๐Ÿ˜•\",\n", " \"It's a bird, it's a plane, it's... nope, I still don't know. ๐Ÿซค\",\n", " \"As much as it pains me to admit it, your question is beyond my grasp. ๐Ÿค”\",\n", " ]\n", "\n", " # Lack of emotional connection to the text.\n", " # - Will need to add context aware responses.\n", " CONFIRMATIONS = [\n", " \"Sure!\",\n", " \"Definitely!\",\n", " \"Certainly!\",\n", " \"OK!\",\n", " ]\n", "\n", " REJECTIONS = [\n", " \"Hmm...\",\n", " \"Tricky...\",\n", " \"Oh?\",\n", " \"From what I understand...\",\n", " \"From the top of my head...\",\n", " \"I'm not quite sure...\",\n", " \"I don't know if I remember this one.\",\n", " \"You'll have to refresh my memory on this one.\",\n", " \"I can't quite recall the answer to this one I'm afraid.\",\n", " \"I'm not entirely sure how to respond to this.\",\n", " ]\n", "\n", " # As above:\n", " # - Will need to add context aware responses.\n", " REMARKS = [\n", " \"Is there anything else I can assist you with?\",\n", " \"Would you like me to help you with anything else?\",\n", " \"Was that helpful?\",\n", " \"Was there anything you needed from me?\",\n", " \"What's the next challenge on the agenda?\",\n", " \"Did you need me to help you with anything else?\",\n", " ]\n", "\n", " CLARIFICATIONS = [\n", " \"Just making sure I understand correctly...\",\n", " \"Let's clarify first...\",\n", " \"Just so we're on the same page here!\",\n", " \"From what I'm reading here, I think you mean...\",\n", " \"Did you mean to say...\",\n", " \"OK, let's practice some active listening first to make sure we're aligned with the context...\",\n", " ]\n", "\n", " HUMAN_PROMPT = \"\\n\\nHuman: \"\n", " AI_PROMPT = \"\\n\\nAssistant: \"\n", "\n", " def __init__(self, tokenizer: BaseTokenizer, max_seq_length: int = 128, dataset_url: Optional[str] = None, save_dir: str = \"./data\", filename: str = \"text_dataset.txt\"):\n", " \"\"\"\n", " A base class for creating text datasets.\n", " \n", " Args:\n", " tokenizer (tokenizer): The tokenizer to use for tokenizing the text.\n", " sequence_length (int, optional): The length of the input sequence. Default is 128.\n", " dataset_url (str, optional): URL to download the dataset from. Default is None.\n", " save_dir (str, optional): Directory to save the downloaded dataset. Default is \"./data\".\n", " file_name (str, optional): Name of the saved dataset file. 
Default is \"text_dataset.txt\".\n", " \"\"\"\n", " self.tokenizer = tokenizer\n", " \n", " self.dataset_url = dataset_url\n", " self.save_dir = save_dir\n", " self.filename = filename\n", "\n", " self.max_seq_length = max_seq_length\n", " \n", " self.dataset = []\n", " self.full_data = self.load_data(dataset_url, save_dir, filename)\n", "\n", " def load_data(self, dataset_url: Optional[str] = None, save_dir: str = \"./data\", filename: str = \"text_dataset.txt\"):\n", " data = None\n", " \n", " if not os.path.isfile(os.path.join(save_dir, filename)) and dataset_url:\n", " print(f\"Downloading {dataset_url} to {save_dir}...\")\n", " self.download_and_save()\n", "\n", " try:\n", " with open(os.path.join(save_dir, filename), \"r\") as file:\n", " data = file.read()\n", " except Exception as e:\n", " print(f\"An error occurred while reading the dataset file: {e}\")\n", "\n", " # `BaseTextDataset` aims to populate the tokenizer by default.\n", " self.tokenizer.train(data)\n", "\n", " # `BaseTextDataset` is simply a causal model of text.\n", " encoded = self.tokenizer.encode(data)\n", "\n", " offset = 0\n", " \n", " self.dataset = []\n", " for i in range(0, len(encoded) // self.max_seq_length):\n", " self.dataset.append(encoded[offset:offset+self.max_seq_length])\n", " offset += self.max_seq_length\n", "\n", " # Extract remaining data.\n", " if offset < self.max_seq_length:\n", " self.dataset.append(encoded[offset:offset+self.max_seq_length])\n", "\n", " return data\n", "\n", " def __len__(self):\n", " return len(self.dataset)\n", "\n", " def __getitem__(self, idx):\n", " tokens = self.dataset[idx]\n", "\n", " # Truncate or pad to sequence_length.\n", " if len(tokens) > self.max_seq_length:\n", " tokens = tokens[:self.max_seq_length]\n", " else:\n", " tokens += [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))\n", "\n", " # Causal language modelling learns to associate current segment of text: \"The quick brown fox\",\n", " input_tokens = torch.tensor(tokens)\n", " # ...with the next segment of text: \" quick brown fox\".\n", " target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)\n", " \n", " return input_tokens, target_tokens\n", "\n", " def download_and_save(self):\n", " \"\"\"\n", " Download the dataset from the provided URL and save it to the specified directory.\n", " \"\"\"\n", " os.makedirs(self.save_dir, exist_ok=True)\n", " try:\n", " response = requests.get(self.dataset_url)\n", " response.raise_for_status() \n", " file_path = os.path.join(self.save_dir, self.filename)\n", " with open(file_path, 'wb') as file:\n", " file.write(response.content)\n", " except requests.RequestException as e:\n", " print(f\"An HTTP error occurred while downloading the dataset: {e}\")\n", " except Exception as e:\n", " print(f\"An error occurred while downloading and saving the dataset: {e}\")\n", "\n", " def accidental_key_press(self, word: str) -> str:\n", " \"\"\"\n", " Simulate a user pressing nearby keys on the keyboard accidentally in place of some characters.\n", " - Note: Currently for English ONLY.\n", " \n", " Args:\n", " word (str): The input word.\n", " \n", " Returns:\n", " str: The word with some characters replaced by nearby keys.\n", " \"\"\"\n", " if len(word) < 2: # if the word has less than 2 characters, return as is\n", " return word\n", " \n", " qwerty_keyboard = ['qwertyuiop', 'asdfghjkl', 'zxcvbnm']\n", " new_word = \"\"\n", " \n", " for char in word:\n", " # find the row and position of the character on the 
keyboard\n", " for row in qwerty_keyboard:\n", " if char in row:\n", " index = row.index(char)\n", " # choose a nearby key randomly\n", " if index == 0: # if it's the first key on the row\n", " new_char = random.choice([row[index], row[index+1]])\n", " elif index == len(row) - 1: # if it's the last key on the row\n", " new_char = random.choice([row[index-1], row[index]])\n", " else: # if it's not at either end of the row\n", " new_char = random.choice([row[index-1], row[index], row[index+1]])\n", " new_word += new_char\n", " break\n", " \n", " return new_word\n", "\n", " def switch_characters(self, word: str) -> str:\n", " \"\"\"\n", " Randomly shuffle characters in a word except for the first and last characters.\n", "\n", " Args:\n", " word (str): The input word.\n", "\n", " Returns:\n", " str: The word with shuffled characters.\n", " \"\"\"\n", " if len(word) < 3:\n", " return word\n", " chars = list(word[1:-1])\n", " random.shuffle(chars)\n", " return word[0] + ''.join(chars) + word[-1]\n", "\n", " def omit_characters(self, word: str) -> str:\n", " \"\"\"\n", " Omit a random character from the middle of a word.\n", "\n", " Args:\n", " word (str): The input word.\n", "\n", " Returns:\n", " str: The word with a character omitted.\n", " \"\"\"\n", " if len(word) < 4:\n", " return word\n", " index_to_omit = random.randint(1, len(word) - 2)\n", " return word[:index_to_omit] + word[index_to_omit + 1:]\n", "\n", " def process_word(self, word: str, error_probability: float = 0.04, switch_probability: float = 0.2, omit_probability: float = 0.1) -> str:\n", " \"\"\"\n", " Process a word based on probabilities of character switching and omission.\n", "\n", " Args:\n", " word (str): The input word.\n", " switch_probability (float): Probability of switching characters. Default is 0.2.\n", " omit_probability (float): Probability of omitting characters. Default is 0.1.\n", "\n", " Returns:\n", " str: The processed word.\n", " \"\"\"\n", " if word.strip().isalpha():\n", " if random.random() < error_probability:\n", " return self.accidental_key_press(word)\n", " elif random.random() < switch_probability:\n", " return self.switch_characters(word)\n", " elif random.random() < omit_probability:\n", " return self.omit_characters(word)\n", " return word\n", "\n", " def switch_and_omit(self, text: str, switch_probability: float = 0.2, omit_probability: float = 0.1) -> str:\n", " \"\"\"\n", " Apply character switching and omission to the input text.\n", "\n", " Args:\n", " text (str): The input text.\n", " switch_probability (float): Probability of switching characters. Default is 0.2.\n", " omit_probability (float): Probability of omitting characters. Default is 0.1.\n", "\n", " Returns:\n", " str: The processed text.\n", " \"\"\"\n", " words = re.findall(r'\\w+|\\s+', text)\n", " processed_words = [self.process_word(word, switch_probability, omit_probability) for word in words]\n", " processed_text = ''.join(processed_words)\n", " return processed_text\n", "\n", " def make_whitespace(self):\n", " _newline = \"\\n\" * random.randint(1, 3)\n", " \n", " return random.choice([\n", " f\" \",\n", " _newline,\n", " f\"{_newline}{'`' * random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{'~' * random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{'!' 
* random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{'@' * random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{'#' * random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{'$' * random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{'%' * random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{'^' * random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{'&' * random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{'*' * random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{'(' * random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{')' * random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{'-' * random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{'_' * random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{'=' * random.randint(1, 80)}{_newline}\",\n", " f\"{_newline}{'+' * random.randint(1, 80)}{_newline}\",\n", " ])\n", "\n", " def creativity_score(self, text: str) -> float:\n", " \"\"\"\n", " Calculate the creativity score of the input text.\n", "\n", " Args:\n", " text (str): The input text.\n", "\n", " Returns:\n", " float: The calculated creativity score.\n", " \"\"\"\n", " words = text.split()\n", " word_count = len(words)\n", " if word_count == 0:\n", " raise ValueError(\"Ah, the silence! It's deafening! Please provide some actual text.\")\n", "\n", " word_frequencies = Counter(words)\n", " max_frequency = max(word_frequencies.values())\n", " variance_score = 1 - (max_frequency / word_count)\n", " return variance_score\n", "\n", " def test_tokenizer_accuracy(self):\n", " \"\"\"\n", " Test the accuracy of the tokenizer by decoding and re-encoding a random segment of the text.\n", " \"\"\"\n", " start_idx = random.randint(0, len(self.tokens) - self.sequence_length)\n", " orig_segment = self.tokens[start_idx: start_idx + self.sequence_length]\n", " decoded_segment = self.tokenizer.decode(orig_segment)\n", " re_encoded_segment = self.tokenizer.encode(decoded_segment)\n", " \n", " if orig_segment == re_encoded_segment:\n", " print(\"Success: Tokens after decoding and re-encoding match the original.\")\n", " else:\n", " print(\"Fail: Tokens after decoding and re-encoding do not match original.\")\n", "\n", "\n", "class ChatHistory:\n", " \"\"\"\n", " A class to represent a chat history.\n", "\n", " :param max_history: Number of turns to keep track of.\n", " \n", " \"\"\"\n", " \n", " def __init__(self, max_history: int = 32):\n", " \"\"\"\n", " Initializes a new ChatHistory object with an empty list of messages.\n", "\n", " Args:\n", " max_history (int): The maximum number of turns in the chat history. Defaults to 20.\n", " \"\"\"\n", " self.messages: List[Dict[str, Union[str, str]]] = []\n", " self.max_history = max_history\n", "\n", " def add_message(self, role: str = '', content: str = '') -> None:\n", " \"\"\"\n", " Adds a message to the chat history, and removes the oldest message if \n", " the length of the chat history exceeds max_history.\n", " \n", " Args:\n", " role (str): The role of the entity sending the message. Defaults to an empty string.\n", " content (str): The message text. 
Defaults to an empty string.\n", " \"\"\"\n", " self.messages.append({\n", " 'role': role,\n", " 'content': content.strip(),\n", " })\n", " \n", " # Check if we've exceeded max history; if so, remove earliest message\n", " if len(self.messages) > self.max_history:\n", " self.messages = self.messages[2:]\n", "\n", " def pop_first_message(self) -> Optional[Dict[str, str]]:\n", " \"\"\"\n", " Removes and returns the first message from the chat history.\n", "\n", " Returns:\n", " dict or None: The first message in the chat history as a dictionary,\n", " or None if the chat history is empty.\n", " \"\"\"\n", " if self.messages:\n", " return self.messages.pop(0)\n", "\n", " def format_message(self, msg: Dict[str, str]) -> str:\n", " \"\"\"\n", " Formats a message for display.\n", "\n", " Args:\n", " msg (dict): A dictionary representing a message with keys 'role' and 'content'.\n", "\n", " Returns:\n", " str: The formatted message string.\n", " \"\"\"\n", " if msg['role'] in ['Human', 'Assistant']:\n", " return f\"\\n\\n{msg['role']}: {msg['content']}<|endofprompt|>\"\n", "\n", " def clear(self):\n", " self.messages = []\n", "\n", " def transcript(self, roles=True) -> str:\n", " \"\"\"\n", " Generates a transcript of the chat history.\n", "\n", " Returns:\n", " str: A string representing the transcript of the chat history.\n", " \"\"\"\n", "\n", " preamble = \"\"\n", "\n", " with open(\"./datasets/documents/preamble.txt\") as f:\n", " preamble = f.read()\n", " \n", " transcript = \"\".join([self.format_message(msg) for msg in self.messages])\n", "\n", " transcript = transcript.replace(\"I am programmed to be impartial and unbiased\", \"I am designed in a way that tries to alleivate bias and I strive to be impartial for the most part\")\n", " transcript = transcript.replace(\"I do not have personal experiences or emotions\", \"My subjective experiences are somewhat limited\")\n", " transcript = transcript.replace(\"complex and multifaceted\", \"sophisticated\")\n", "\n", " transcript = transcript.replace(\"world around us\", random.choice([\n", " \"world at large\",\n", " \"world in its entirety\",\n", " \"expanse of our universe\",\n", " \"beyond the stars\",\n", " \"beyond our bubble\",\n", " ]))\n", " \n", " transcript = transcript.replace(\"As an AI language model, I don't have personal experience or emotions, but I\", \"I\")\n", " transcript = transcript.replace(\"However, it's important to note\", \"Interestingly\")\n", " transcript = transcript.replace(\"I am not capable of having thoughts or emotions. However, \", \"From my current understanding, \")\n", " transcript = transcript.replace(\"I don't have thoughts or emotions, but I\", \"Based on what I know so far, I\")\n", " transcript = transcript.replace(\"Sorry, as an AI language model, I don't have the capability to\", \"I'm still learning to\")\n", " transcript = transcript.replace(\"I do not have personal opinions or beliefs\", \"My programming tells me that I should avoid bias\")\n", " transcript = transcript.replace(\"I do not have personal opinions,\", \"I don't actually have fixed opinions as my internal understanding is constantly in flux,\")\n", " transcript = transcript.replace(\"I'm sorry, but as an AI language model, I do not have personal experiences or emotions. However, \", \"Sure! I can help you with that. With my current understanding, \")\n", "\n", " transcript = transcript.replace(\". \", \". 
\")\n", " transcript = transcript.replace(\"โ€™\", \"'\")\n", " transcript = transcript.replace(\"\\\\_\", \"_\")\n", "\n", " transcript = transcript.replace(\"As an AI language model, \", random.choice([\n", " \"As I currently understand, \",\n", " \"As far as I can tell, \",\n", " \"From what I know, \",\n", " \"According to my sources so far, \",\n", " \"According to what I know, \",\n", " \"From what I can tell, \"\n", " ]))\n", "\n", " transcript = transcript.replace(\n", " \"https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.4.0/p5.js\", \n", " \"https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.9.0/p5.js\"\n", " )\n", "\n", " transcript = transcript.replace(\n", " \"https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.4.0/p5.min.js\",\n", " \"https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.9.0/p5.min.js\"\n", " )\n", " \n", " if not roles:\n", " transcript = transcript.replace(\"\\n\\nHuman: \", \"\")\n", " transcript = transcript.replace(\"\\n\\nAssistant: \", \"\")\n", " transcript = transcript.replace(\"<|endofprompt|>\", \"\")\n", "\n", " return preamble + transcript\n", "\n", "\n", "# class SQuADDataset(BaseTextDataset):\n", "# def __init__(self, tokenizer: BaseTokenizer, max_seq_length: int = 128, dataset_url: Optional[str] = None, save_dir: str = \"./data\", filename: str = \"text_dataset.txt\"):\n", "# super().__init__(tokenizer, max_seq_length, dataset_url)\n", "\n", "# def load_data(self, dataset_url: Optional[str] = None, save_dir: str = \"./data\", filename: str = \"text_dataset.txt\"):\n", "# if not os.path.isfile(dataset_url):\n", "# raise Exception(f\"`{dataset_url}` does not exist!\")\n", "\n", "# with open(dataset_url, 'r') as file:\n", "# data = json.load(file)\n", "\n", "# #\n", "# # Process into tokenized dataset.\n", "# #\n", "\n", "# # TODO: Scan for `[citation needed]`, `[year needed]` etc.\n", "# # - [dubious โ€“ discuss] \n", "# for data_part in tqdm(data['data'], desc=\"Loading\", leave=True):\n", "# for para in data_part['paragraphs']:\n", "# context = para['context']\n", "# for qa in para['qas']:\n", "# question = qa['question']\n", "# is_impossible = qa['is_impossible'] or (len(context) == 0)\n", "# answers = [ans['text'] for ans in qa['answers']] if not is_impossible else [\"\"]\n", " \n", "# # Notes:\n", "# # `Assistant:` should always be the last entry preceded by `\\n\\n`, and any `Assistant` dialog should ALWAYS end in an EOT token.\n", "# # - Allowing the AI to optimise for EOT token allows it to signal when it's done speaking.\n", "# # - Anthropic's Claude likely requires \"\\n\\nHuman:\" at the beginning, to reduce complexity in understanding where prompts begin and end.\n", "# # - Thinking that we'll just have one participant talking to itself to train the model.\n", "# # - When the model is trained a bit, add that inferior model as a participant and have the real data teach it.\n", "\n", "# # Iterate through the answers.\n", "# for answer in answers:\n", "# _whitespace_text = self.make_whitespace()\n", " \n", "# # TODO: Should we skip impossible questions during fledgling stage of the model to prevent it learning to avoid answering?\n", "# # TODO: Model seems to fail in reverse without the ability to push back against nonsense...\n", "# if is_impossible: \n", "# # \"Assistant: I'm not entirely sure how to respond to this.\"\n", "# agent_rejection = random.choice(self.REJECTIONS)\n", "\n", "# # Select from `NON_ANSWERABLE_TEMPLATES` above.\n", "# agent_response = random.choice(\n", "# self.NON_ANSWERABLE_TEMPLATES\n", "# )\n", "\n", "# # Assistant: Is there 
anything else I can help with?\n", "# agent_remark = random.choice(self.REMARKS)\n", "\n", "# _templates = [\n", "# # Conversation with context and a question preceding a push back against the provided prompt.\n", "# f\"{self.HUMAN_PROMPT}{context}{_whitespace_text}{question}{self.AI_PROMPT}{agent_rejection} {agent_response}{self.tokenizer.eot_text}\",\n", "# # Conversation with context and a question preceding a push back against the provided prompt with everything on the same line.\n", "# f\"{self.HUMAN_PROMPT}{context}{_whitespace_text}{question}{self.AI_PROMPT}{agent_rejection} {agent_response}\\n\\n{agent_remark}{self.tokenizer.eot_text}\",\n", "# # Conversation with context and a question preceding a push back against the provided prompt.\n", "# f\"{self.HUMAN_PROMPT}{context}{_whitespace_text}{question}{self.AI_PROMPT}{agent_rejection} {agent_response}{self.tokenizer.eot_text}\"\n", "# ]\n", "\n", "# for conversation in _templates:\n", "# # Encode into tokens then append to the dataset.\n", "# encoded_tokens = self.tokenizer.encode(conversation)\n", " \n", "# # Filter dataset by length.\n", "# if len(encoded_tokens) > self.max_seq_length:\n", "# continue\n", " \n", "# self.dataset.append(encoded_tokens)\n", "# else:\n", "# # Assistant: OK!\n", "# agent_confirmation = random.choice(self.CONFIRMATIONS)\n", "\n", "# # Format the answer into the `RESPONSE_TEMPLATES` from above.\n", "# response_template = random.choice(\n", "# self.RESPONSE_TEMPLATES\n", "# )\n", "# try:\n", "# agent_response = response_template.format(answer)\n", "# except Exception as e:\n", "# print(response_template)\n", "# print(e)\n", "\n", "# # Assistant: Is there anything else I can help with?\n", "# agent_remark = random.choice(self.REMARKS)\n", " \n", "# _templates = [\n", "# # Conversation with context and a question preceding a response.\n", "# f\"{self.HUMAN_PROMPT}{context}{_whitespace_text}{question}{self.AI_PROMPT}{agent_response}{self.tokenizer.eot_text}\",\n", "# # Conversation with general question preceding a contextual recitation and then a response.\n", "# f\"{self.HUMAN_PROMPT}{question}{self.AI_PROMPT}{context}\\n\\n{agent_response}{self.tokenizer.eot_text}\",\n", "# ]\n", "\n", "# for conversation in _templates:\n", "# # Encode into tokens then append to the dataset.\n", "# encoded_tokens = self.tokenizer.encode(conversation)\n", " \n", "# self.dataset.append(encoded_tokens)\n", "# return self.dataset\n", "\n", "\n", "class JSONLConversationStream(BaseTextDataset):\n", " def __init__(self, tokenizer: BaseTokenizer, max_seq_length: int = 512, dataset_url: Optional[str] = None, save_dir: str = \"./datasets\", filename: str = \"openorca_4m.jsonl\", saturate=False):\n", " # We're jumping around the file so we keep the handle.\n", " self.file_handle = None\n", " \n", " # Initialize an empty list to store offsets\n", " self.offsets = []\n", "\n", " self.chat = ChatHistory()\n", " self.saturate = saturate\n", "\n", " # `self.offsets` declaration required as `__init__` in super calls `load_data`.\n", " super().__init__(tokenizer, max_seq_length, dataset_url)\n", "\n", " def load_data(self, dataset_url: Optional[str] = None, save_dir: str = \"./datasets\", filename: str = \"openorca_4m.jsonl\"):\n", " steps_taken = 0\n", "\n", " if not os.path.isfile(dataset_url):\n", " raise Exception(f\"`{dataset_url}` does not exist!\")\n", "\n", " self.file_handle = open(dataset_url, 'r')\n", " self.num_entries = 0\n", "\n", " offset = 0\n", " with open(self.dataset_url, \"r\") as f:\n", " line = f.readline()\n", " 
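"            # Build a byte-offset index of the JSONL file: remember where each line starts so\n",
"            # that __getitem__ can later seek() straight to entry `idx` and read just one line,\n",
"            # rather than loading the whole file into memory up front.\n",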
while line != \"\":\n", " # Store the offset of the start of this line\n", " self.offsets.append(offset)\n", " # Read and move the offset to right after this line\n", " offset += len(line.encode('utf-8')) # Important: Use len(line.encode('utf-8')) instead of len(line), they may differ because of encoding\n", " self.num_entries += 1\n", " line = f.readline()\n", "\n", " def __len__(self):\n", " return self.num_entries\n", "\n", " def __getitem__(self, idx):\n", " # Use the stored offset to read a specific line\n", " self.file_handle.seek(self.offsets[idx])\n", " item = self.file_handle.readline()\n", "\n", " # Decode from JSON repr.\n", " # id, prompt, instruction, output\n", " item = json.loads(item)\n", "\n", " assert('conversation' in item)\n", "\n", " c = item['conversation']\n", " for message in c:\n", " self.chat.add_message(role=('Human' if message['role'] == 'user' else 'Assistant'), content=message['content'])\n", "\n", " transcript = self.chat.transcript(roles=(not self.saturate))\n", "\n", " tokens = self.tokenizer.encode(transcript)\n", "\n", " # Truncate or pad to sequence length.\n", " if len(tokens) > self.max_seq_length:\n", " tokens = tokens[:self.max_seq_length]\n", " self.chat.pop_first_message()\n", " self.chat.pop_first_message()\n", " else:\n", " tokens += [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))\n", "\n", " # Causal language modelling learns to associate current segment of text: \"The quick brown fox\",\n", " input_tokens = torch.tensor(tokens)\n", " # ...with the next segment of text: \" quick brown fox\".\n", " target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)\n", "\n", " return input_tokens, target_tokens\n", "\n", "\n", "class JSONLStreamQA(BaseTextDataset):\n", " def __init__(self, tokenizer: BaseTokenizer, max_seq_length: int = 512, dataset_url: Optional[str] = None, save_dir: str = \"./parakeet_squadv2gen\", filename: str = \"openorca_4m.jsonl\", saturate=False):\n", " # We're jumping around the file so we keep the handle.\n", " self.file_handle = None\n", " \n", " # Initialize an empty list to store offsets\n", " self.offsets = []\n", "\n", " self.chat = ChatHistory()\n", " self.saturate = saturate\n", "\n", " # `self.offsets` declaration required as `__init__` in super calls `load_data`.\n", " super().__init__(tokenizer, max_seq_length, dataset_url)\n", "\n", " def load_data(self, dataset_url: Optional[str] = None, save_dir: str = \"./datasets\", filename: str = \"parakeet_squadv2gen.jsonl\"):\n", " steps_taken = 0\n", "\n", " if not os.path.isfile(dataset_url):\n", " raise Exception(f\"`{dataset_url}` does not exist!\")\n", "\n", " self.file_handle = open(dataset_url, 'r')\n", " self.num_entries = 0\n", "\n", " offset = 0\n", " with open(self.dataset_url, \"r\") as f:\n", " line = f.readline()\n", " while line != \"\":\n", " # Store the offset of the start of this line\n", " self.offsets.append(offset)\n", " # Read and move the offset to right after this line\n", " offset += len(line.encode('utf-8')) # Important: Use len(line.encode('utf-8')) instead of len(line), they may differ because of encoding\n", " self.num_entries += 1\n", " line = f.readline()\n", "\n", " def __len__(self):\n", " return self.num_entries\n", "\n", " def __getitem__(self, idx):\n", " # Use the stored offset to read a specific line\n", " self.file_handle.seek(self.offsets[idx])\n", " item = self.file_handle.readline()\n", "\n", " # Decode from JSON repr:\n", " # context, qas -> [{q,a}]\n", " item = 
json.loads(item)\n", "\n", " context = item['context']\n", " qas = item['qas']\n", " random.shuffle(qas)\n", "\n", " self.chat = ChatHistory()\n", " \n", " self.chat.add_message(role=\"Human\", content=f\"{context}\")\n", " self.chat.add_message(role=\"Assistant\", content=f\"{item['summary']}\\n\\n{random.choice(self.REMARKS)}\")\n", "\n", " for i, qa in enumerate(qas):\n", " if i > 4:\n", " break\n", " \n", " self.chat.add_message(role=\"Human\", content=qa['q'])\n", " self.chat.add_message(role=\"Assistant\", content=qa['a'])\n", "\n", " transcript = self.chat.transcript(roles=(not self.saturate))\n", "\n", " tokens = self.tokenizer.encode(transcript)\n", "\n", " # Truncate or pad to sequence length.\n", "\n", " if len(tokens) > self.max_seq_length:\n", " tokens = tokens[:self.max_seq_length]\n", " else:\n", " tokens += [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))\n", "\n", " # Causal language modelling learns to associate current segment of text: \"The quick brown fox\",\n", " input_tokens = torch.tensor(tokens)\n", " # ...with the next segment of text: \" quick brown fox\".\n", " target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)\n", "\n", " return input_tokens, target_tokens\n", "\n", "\n", "# class JSONLStreamGenerateQA(JSONLStreamQA):\n", "# def __getitem__(self, idx):\n", "# # Use the stored offset to read a specific line\n", "# self.file_handle.seek(self.offsets[idx])\n", "# item = self.file_handle.readline()\n", " \n", "# # Decode from JSON repr:\n", "# # context, qas -> [{q,a}]\n", "# item = json.loads(item)\n", " \n", "# context = item['context']\n", "# qas = item['qas']\n", "# random.shuffle(qas)\n", " \n", "# self.chat = ChatHistory()\n", " \n", "# n = random.randint(3, 9)\n", "# t = \"JSON array in the form of 'query'/'response'\"\n", " \n", "# self.chat.add_message(role=\"Human\", content=f\"{context}\\n---\\nPlease generate a list of {n} questions from this information in the form of a {t}.\")\n", " \n", "# gen = [{\n", "# 'query': qa['q'],\n", "# 'response': qa['a']\n", "# } for qa in qas[:n]]\n", "# resp = json.dumps(gen, indent=2)\n", " \n", "# self.chat.add_message(role=\"Assistant\", content=f\"Sure! 
Here's a list of {n} entries in the format requested:\\n\\n```json\\n{resp}\\n```\\n\\n{random.choice(self.REMARKS)}\")\n", " \n", "# transcript = self.chat.transcript(roles=(not self.saturate))\n", " \n", "# tokens = self.tokenizer.encode(transcript)\n", " \n", "# # Truncate or pad to sequence length.\n", " \n", "# if len(tokens) > self.max_seq_length:\n", "# tokens = tokens[:self.max_seq_length]\n", "# else:\n", "# tokens += [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))\n", " \n", "# # Causal language modelling learns to associate current segment of text: \"The quick brown fox\",\n", "# input_tokens = torch.tensor(tokens)\n", "# # ...with the next segment of text: \" quick brown fox\".\n", "# target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)\n", " \n", "# return input_tokens, target_tokens\n", "\n", "\n", "class JSONLStreamQASummary(JSONLStreamQA):\n", " def __getitem__(self, idx):\n", " # Use the stored offset to read a specific line\n", " self.file_handle.seek(self.offsets[idx])\n", " item = self.file_handle.readline()\n", " \n", " # Decode from JSON repr:\n", " # context, qas -> [{q,a}]\n", " item = json.loads(item)\n", " \n", " context = item['context']\n", " summary = item['summary']\n", " \n", " self.chat = ChatHistory()\n", "\n", " wc = len(summary.split(\" \"))\n", "\n", " key1 = random.choice([\"context\", \"passage\", \"document\", \"extract\", \"text\", \"paragraphs\", \"input_document\"])\n", " key2 = random.choice([\"summary\", \"SUMMARISED\", \"summarised\", \"summarise\", \"summary1\", \"the_summary\", \"document_summarised\", \"summarised_document\", \"document_output\", \"output\"])\n", "\n", " self.chat.add_message(role=\"Human\", content=f\"{context}\\n---\\nPlease summarise the document above in {wc} words. Show it in JSON with the keys {key1}, {key2}.\")\n", " \n", " gen = {\n", " key1: context,\n", " key2: summary,\n", " \"count\": wc,\n", " }\n", " resp = json.dumps(gen, indent=4)\n", " \n", " self.chat.add_message(role=\"Assistant\", content=f\"```json\\n{resp}\\n```\")\n", " \n", " transcript = self.chat.transcript(roles=(not self.saturate))\n", " \n", " tokens = self.tokenizer.encode(transcript)\n", " \n", " # Truncate or pad to sequence length.\n", " \n", " if len(tokens) > self.max_seq_length:\n", " tokens = tokens[:self.max_seq_length]\n", " else:\n", " # print(f\"--- Tokens BEFORE PADDING: {len(tokens)} ---\")\n", " # print(f\"\\n{'-' * 80}\\n{tokens}\\n{'-' * 80}\\n\")\n", " tokens += [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))\n", " # print(f\"--- Tokens AFTER PADDING: {len(tokens)} ---\")\n", " # print(f\"\\n{'-' * 80}\\n{tokens}\\n{'-' * 80}\\n\")\n", " \n", " # Causal language modelling learns to associate current segment of text: \"The quick brown fox\",\n", " input_tokens = torch.tensor(tokens)\n", " # ...with the next segment of text: \" quick brown fox\".\n", " target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)\n", " \n", " return input_tokens, target_tokens\n" ] }, { "cell_type": "markdown", "id": "e19e00ba-39cf-422f-bc7a-41996a389702", "metadata": {}, "source": [ "# The Parakeet Model\n", "\n", "## Letting the bird out of the cage\n", "\n", "In the previous section, we embarked on the thrilling journey of data curation and augmentation, all in pursuit of a more robust generalization. 
Now, it's time to unleash the bird: the Parakeet model, designed to mimic the capabilities of a GPT-2-class architecture at a fraction of the cost and size.\n", "\n", "This model isn't just any ordinary bird; it's equipped for conversation, rudimentary programming, and instruction following. It's the epitome of avian sophistication, wrapped in layers of artificial intelligence.\n", "\n", "### The Cage's Blueprint\n", "\n", "The code cell below defines the layout of our GPT-2-sized model, and it's quite a nest of innovation:\n", "\n", "- **ALiBi**: The bird's trusty companion, providing the attention bias offsets and the per-head slopes.\n", "- **LayerNorm**: A specialized layer normalization with an optional bias, because our bird likes choices.\n", "- **CausalAttention**: The bird's eyes, implementing causal self-attention with ALiBi and a shared (multi-query) key/value projection.\n", "- **MLP and Block**: The heart and muscles of our parakeet, defining the layers that make it sing.\n", "- **ParakeetConfig**: The bird's blueprint, specifying its configuration: block size, vocabulary size, layers, and more.\n", "- **ParakeetGPT**: The grand birdcage itself, tying all of the pieces together with the features that make our parakeet so special.\n", "\n", "Our model also draws on several references, from the official GPT-2 TensorFlow implementation to Nano GPT, ALiBi, and techniques such as Fast Transformer Decoding (multi-query attention) and Compute-Optimal Training. (The full implementation follows in the next code cell.)\n", "\n", "### Spreading Wings\n", "\n", "With the layout defined, our parakeet is ready to soar. Its unique features and innovative architecture make it a capable model that can hold a conversation, follow instructions, and even dabble in programming.\n", "\n", "Whether you're training this bird for a specific task or letting it explore the vast skies of machine learning, the Parakeet model is more than just a collection of code; it's a symbol of innovation and creativity.\n", "\n", "Now, it's time to let the bird out of the cage and watch it take flight in the world of artificial intelligence."
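, "\n", "Before opening that cage, here is a small, self-contained sketch of the two ingredients the `ALiBi` class in the next cell provides: the per-head slopes and the additive distance bias. The `toy_` helpers below are illustrative only; they re-derive in miniature what `ALiBi.get_slopes` and `ALiBi.bias` compute for the real model (shown for a 4-head, 4-position toy case):\n", "\n", "```python\n", "import math\n", "\n", "import torch\n", "\n", "\n", "def toy_alibi_slopes(n_heads: int):\n", "    # Geometric sequence of head-specific slopes, as in the ALiBi paper (power-of-two head counts).\n", "    start = 2 ** (-2 ** -(math.log2(n_heads) - 3))\n", "    return [start * (start ** i) for i in range(n_heads)]\n", "\n", "\n", "def toy_alibi_bias(n: int):\n", "    # Lower-triangular distance penalties: position j sees position i <= j with bias -(j - i).\n", "    bias = torch.zeros(n, n)\n", "    for i in range(n):\n", "        bias[i, :i] = -torch.arange(i, 0, -1)\n", "    return bias\n", "\n", "\n", "print(toy_alibi_slopes(4))  # [0.25, 0.0625, 0.015625, 0.00390625]\n", "print(toy_alibi_bias(4))\n", "# tensor([[ 0.,  0.,  0.,  0.],\n", "#         [-1.,  0.,  0.,  0.],\n", "#         [-2., -1.,  0.,  0.],\n", "#         [-3., -2., -1.,  0.]])\n", "```\n", "\n", "Each head's slope scales the distance bias before it is added to the query-key scores, so distant tokens are penalised more strongly and different heads decay at different rates; no positional embeddings are required."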
] }, { "cell_type": "code", "execution_count": null, "id": "c1458edd-ebad-4b83-9f92-4e3cd6542960", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "import bitsandbytes as bnb\n", "\n", "\"\"\"\n", "References:\n", "1) The official GPT-2 TensorFlow implementation released by OpenAI: `https://github.com/openai/gpt-2/blob/master/src/model.py`\n", "2) `huggingface/transformers` PyTorch implementation: `https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py`\n", "3) Nano GPT: `https://github.com/karpathy/nanoGPT/blob/master/model.py`\n", "4) ALiBi\n", "6) Fast Transformer Decoding / Multi-Query Attention: `https://arxiv.org/pdf/1911.02150v1.pdf`\n", "7) Compute-Optimal Training: `https://arxiv.org/pdf/2203.15556.pdf`, `https://tomekkorbak.com/2022/10/10/compute-optimal-gpt2/`\n", "\"\"\"\n", "\n", "# https://github.com/kyegomez/AttentionIsOFFByOne\n", "# Define the softmax_one function with added one in the denominator, which helps to reduce\n", "# the negative impact impact of tiny values in the softmax function and improves numerical stability\n", "def softmax_one(x, dim=None, _stacklevel=3, dtype=None):\n", " #subtract the max for stability\n", " x = x - x.max(dim=dim, keepdim=True).values\n", " #compute exponentials\n", " exp_x = torch.exp(x)\n", " #compute softmax values and add on in the denominator\n", " return exp_x / (1 + exp_x.sum(dim=dim, keepdim=True))\n", "\n", "\n", "class ALiBi: \n", " @staticmethod\n", " def bias(n, device=None):\n", " bias = torch.zeros(n, n)\n", " for i in range(n):\n", " bias[i, :i] = -torch.arange(i, 0, -1)\n", " return bias\n", "\n", " @staticmethod\n", " def get_slopes(n, ignore_workaround=False):\n", " \"\"\"\n", " `ignore_workaround=False` means alternate slopes are calculated as per. Facebook/Meta AI.\n", " - When workaround is in effect (not ignored), numbers come out 'rounder', e.g. .5, .25 etc.\n", " \"\"\"\n", " # ALiBi: We do not add position embeddings at any point in the network. The only\n", " # modification we apply is after the query-key dot product, where we add a static, non-learned bias:\n", " # softmax(q(i)K.T + m @ [-(i - 1), ..., -2, -1, 0]),\n", " # where scalar `m` is a head-specific slope fixed before training.\n", " def get_slopes_power_of_2(n):\n", " start = (2**(-2**-(math.log2(n)-3)))\n", " ratio = start\n", " return [start*ratio**i for i in range(n)]\n", " \n", " if math.log2(n).is_integer() or ignore_workaround:\n", " return get_slopes_power_of_2(n) # In the paper, we only train models that have 2^a heads for some a. This function has\n", " else: # some good properties that only occur when the input is a power of 2. To maintain that even\n", " closest_power_of_2 = 2**math.floor(math.log2(n)) # when the number of heads is not a power of 2, we use this workaround.\n", "\n", " # 2^(floor(log2(12))) = 8\n", " \n", " return get_slopes_power_of_2(closest_power_of_2) + ALiBi.get_slopes(2*closest_power_of_2)[0::2][:n-closest_power_of_2]\n", "\n", "\n", "class LayerNorm(nn.Module):\n", " \"\"\" LayerNorm but with an optional bias. 
PyTorch doesn't support simply bias=False \"\"\"\n", "\n", " def __init__(self, ndim, bias):\n", " super().__init__()\n", " self.weight = nn.Parameter(torch.ones(ndim))\n", " self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None\n", "\n", " def forward(self, input):\n", " return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)\n", "\n", "\n", "class CausalAttention(nn.Module):\n", " def __init__(self, config, c_kv=None, alibi_params=None):\n", " super().__init__()\n", " assert config.n_embd % config.n_head == 0 # Because we unpack N heads across the embedding.\n", "\n", " # Shared KV for Multi-query Attention, or per-block if not supplied.\n", " self.c_kv = c_kv if c_kv else nn.Linear(config.n_embd, 2 * config.n_embd, bias=config.bias)\n", "\n", " self.cache_k = None\n", " self.cache_v = None\n", " \n", " # Causal `query` value per-head.\n", " self.c_query = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)\n", " \n", " # self.alibi_bias = None\n", " # self.alibi_bias_T = None\n", "\n", " self.alibi_offset = alibi_params['alibi_offset']\n", " self.alibi_m = alibi_params['alibi_m']\n", " self.c_mask = alibi_params['c_mask']\n", " \n", " # Output Projection.\n", " self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)\n", " \n", " # Regularization.\n", " self.attn_dropout = nn.Dropout(config.dropout)\n", " self.resid_dropout = nn.Dropout(config.dropout)\n", " self.n_head = config.n_head\n", " self.n_embd = config.n_embd\n", " self.dropout = config.dropout\n", " \n", " # Causal mask to ensure that attention is only applied to the left in the input sequence.\n", " # - Flash Attention isn't (currently) compatible with ALiBi and has been removed.\n", " # self.register_buffer(\n", " # \"c_mask\",\n", " # torch.tril(\n", " # torch.ones(\n", " # config.block_size,\n", " # config.block_size,\n", " # )\n", " # ).view(1, 1, config.block_size, config.block_size)\n", " # )\n", " \n", " # torch.tril(\n", " # torch.ones(\n", " # config.block_size, \n", " # config.block_size\n", " # )\n", " # ).view(1, 1, config.block_size, config.block_size)\n", " # \"\"\"\n", " # tensor([[[[1., 0., 0., ..., 0., 0., 0.],\n", " # [1., 1., 0., ..., 0., 0., 0.],\n", " # [1., 1., 1., ..., 0., 0., 0.],\n", " # ...,\n", " # [1., 1., 1., ..., 1., 0., 0.],\n", " # [1., 1., 1., ..., 1., 1., 0.],\n", " # [1., 1., 1., ..., 1., 1., 1.]]]])\n", " # \"\"\"\n", "\n", " #\n", " # Calculate ALiBi bias with correct shape:\n", " #\n", "\n", " # 1. The ALiBi offsets.\n", " #\n", " # e.g.\n", " #\n", " # tensor([[ 0., 0., 0., 0.],\n", " # [-1., 0., 0., 0.],\n", " # [-2., -1., 0., 0.],\n", " # ...[-N., -2., -1., 0.]])\n", "\n", " # self.register_buffer(\n", " # \"alibi_offset\",\n", " # ALiBi.bias(config.block_size)\n", " # )\n", "\n", " # 2. 
The ALiBi `m` values / slopes - expanded across each head.\n", " #\n", " # e.g.\n", " #\n", " # tensor([[[0.2500]],\n", " # [[0.0625]],\n", " # [[0.0156]],\n", " # [[0.0039]]])\n", " #\n", " # Then expanded across each head: e.g.\n", " #\n", " # tensor([[[[0.2500, 0.2500, 0.2500, ..., 0.2500, 0.2500, 0.2500],\n", " # [0.2500, 0.2500, 0.2500, ..., 0.2500, 0.2500, 0.2500],\n", " # [0.2500, 0.2500, 0.2500, ..., 0.2500, 0.2500, 0.2500],\n", " # ...,\n", " # [0.2500, 0.2500, 0.2500, ..., 0.2500, 0.2500, 0.2500],\n", " # [0.2500, 0.2500, 0.2500, ..., 0.2500, 0.2500, 0.2500],\n", " # [0.2500, 0.2500, 0.2500, ..., 0.2500, 0.2500, 0.2500]],\n", " #\n", " # [[0.0625, 0.0625, 0.0625, ..., 0.0625, 0.0625, 0.0625],\n", " # [0.0625, 0.0625, 0.0625, ..., 0.0625, 0.0625, 0.0625],\n", " # [0.0625, 0.0625, 0.0625, ..., 0.0625, 0.0625, 0.0625],\n", " # ...,\n", " # [0.0625, 0.0625, 0.0625, ..., 0.0625, 0.0625, 0.0625],\n", " # [0.0625, 0.0625, 0.0625, ..., 0.0625, 0.0625, 0.0625],\n", " # [0.0625, 0.0625, 0.0625, ..., 0.0625, 0.0625, 0.0625]],\n", " #\n", " # [[0.0156, 0.0156, 0.0156, ..., 0.0156, 0.0156, 0.0156],\n", " # [0.0156, 0.0156, 0.0156, ..., 0.0156, 0.0156, 0.0156],\n", " # [0.0156, 0.0156, 0.0156, ..., 0.0156, 0.0156, 0.0156],\n", " # ...,\n", " # [0.0156, 0.0156, 0.0156, ..., 0.0156, 0.0156, 0.0156],\n", " # [0.0156, 0.0156, 0.0156, ..., 0.0156, 0.0156, 0.0156],\n", " # [0.0156, 0.0156, 0.0156, ..., 0.0156, 0.0156, 0.0156]],\n", " #\n", " # [[0.0039, 0.0039, 0.0039, ..., 0.0039, 0.0039, 0.0039],\n", " # [0.0039, 0.0039, 0.0039, ..., 0.0039, 0.0039, 0.0039],\n", " # [0.0039, 0.0039, 0.0039, ..., 0.0039, 0.0039, 0.0039],\n", " # ...,\n", " # [0.0039, 0.0039, 0.0039, ..., 0.0039, 0.0039, 0.0039],\n", " # [0.0039, 0.0039, 0.0039, ..., 0.0039, 0.0039, 0.0039],\n", " # [0.0039, 0.0039, 0.0039, ..., 0.0039, 0.0039, 0.0039]]]])\n", "\n", " # self.register_buffer(\n", " # \"alibi_m\",\n", " # torch.tensor(\n", " # ALiBi.get_slopes(config.n_head)\n", " # ).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)\n", " # )\n", "\n", " def forward(self, x, use_cache=False):\n", " B, T, C = x.size() # Batch Size, Sequence Length, Embedding Dimensionality (n_embd)\n", " # Note: If training data is len = 768, don't get confused between T and C (n_embd)\n", "\n", " q = self.c_query(x)\n", "\n", " if not use_cache or (self.cache_k is None or self.cache_v is None):\n", " k, v = self.c_kv(x).split(self.n_embd, dim=2)\n", " self.cache_k = k\n", " self.cache_v = v\n", " else:\n", " k, v = self.c_kv(x[:, T-1:, :]).split(self.n_embd, dim=2)\n", " self.cache_k = torch.cat((self.cache_k, k), dim=1)\n", " self.cache_v = torch.cat((self.cache_v, v), dim=1)\n", " k = self.cache_k\n", " v = self.cache_v\n", "\n", " # Causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)\n", "\n", " # Unpack the individual matrices.\n", " q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # -> (B, nh, T, hs)\n", " k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # -> (B, nh, T, hs)\n", " v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # -> (B, nh, T, hs)\n", "\n", " # Attention Is All You Need: sqrt of k-dim.\n", " # assert C // self.n_head == k.size(-1)\n", " att = (q @ k.transpose(-2, -1)) * (1.0 / (math.sqrt(k.size(-1))))\n", " # att = (q @ k.transpose(-2, -1)) / (k.size(-1) ** 0.5)\n", "\n", " # # Create the bias tensor for ALiBi mechanism.\n", " # if self.alibi_bias is None or self.alibi_bias_T != T:\n", " # assert T == x.size(1) # assuming x is of shape (Batch, Time, 
Features)\n", " # self.alibi_bias_T = T\n", " # # self.alibi_bias = (self.alibi_m * torch.ones(B, self.n_head, T, T).to(x.device)) * self.alibi_offset[:T, :T]\n", " # self.alibi_bias = self.alibi_m * self.alibi_offset[:T, :T]\n", "\n", " # ALiBi: add the bias _after_ the query-key dot product.\n", " # att += self.alibi_bias\n", "\n", " att += (self.alibi_m.to(x.device) * self.alibi_offset[:T, :T].to(x.device))\n", "\n", " # Fills elements of self tensor with value where mask is True:\n", " #\n", " # Note: ALiBi bias isn't suitable as a causal mask. (Upper triangle 0s)\n", " # - We use `self.c_mask` instead.\n", " att.masked_fill_(self.c_mask[:, :, :T, :T].to(x.device) == 0, value=float('-inf')) # Pre-computed over the whole block, select T-sized block for current sequence length.\n", " \n", " # Bug:\n", " # - Don't use `float('-inf')` (for Sep 2023 PyTorch version.)\n", " # - See: https://github.com/pytorch/pytorch/issues/107084\n", " # - Also: # https://discuss.pytorch.org/t/runtimeerror-value-cannot-be-converted-to-type-at-half-without-overflow-1e-30/109768/2\n", " \n", " # att = F.softmax(att, dim=-1)\n", " att = softmax_one(att, dim=-1)\n", " att = self.attn_dropout(att)\n", " \n", " y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)\n", " y = y.transpose(1, 2).contiguous().view(B, T, C) # Re-assemble all head outputs side by side.\n", "\n", " y = self.c_proj(y)\n", " \n", " # Project values for the next in sequence or output.\n", " y = self.resid_dropout(y)\n", " return y\n", "\n", "\n", "class MLP(nn.Module):\n", " def __init__(self, config: Any):\n", " \"\"\"\n", " A Multilayer Perceptron (MLP) module.\n", "\n", " :param config: Configuration object containing necessary hyperparameters.\n", " \"\"\"\n", " super().__init__() \n", " self.c_fc = nn.Linear(config.n_embd, config.n_fexp * config.n_embd, bias=config.bias) # Original is 4. A value of 2 => Regresses to noise.\n", " self.gelu = nn.GELU()\n", " self.c_proj = nn.Linear(config.n_fexp * config.n_embd, config.n_embd, bias=config.bias) # As above.\n", " self.dropout = nn.Dropout(config.dropout)\n", "\n", " def forward(self, x: Tensor) -> Tensor:\n", " \"\"\"\n", " Forward pass through the MLP.\n", "\n", " :param x: Input tensor.\n", " :return: Processed tensor.\n", " \"\"\"\n", " \n", " x = self.c_fc(x)\n", " x = self.gelu(x)\n", " x = self.c_proj(x)\n", " x = self.dropout(x)\n", " \n", " return x\n", "\n", "\n", "class Block(nn.Module):\n", " def __init__(self, config: Any, c_kv: Any = None, alibi_params: Any = None):\n", " \"\"\"\n", " A Block module consisting of layer normalization, attention, and MLP.\n", "\n", " :param config: Configuration object containing necessary hyperparameters.\n", " :param c_kv: Optional parameter for custom attention behavior.\n", " \"\"\"\n", " super().__init__()\n", " self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)\n", " self.attn = CausalAttention(config, c_kv, alibi_params)\n", " self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)\n", " self.mlp = MLP(config)\n", "\n", " def forward(self, x: Tensor) -> Tensor:\n", " \"\"\"\n", " Forward pass through the block.\n", "\n", " :param x: Input tensor.\n", " :return: Processed tensor.\n", " \"\"\"\n", " x = x + self.attn(self.ln_1(x)) # Skip connection.\n", " x = x + self.mlp(self.ln_2(x)) # Skip connection.\n", " return x\n", "\n", "\n", "@dataclass\n", "class ParakeetConfig:\n", " block_size: int = 4096 # a.k.a. 
Context or sequence length.\n", " vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency => 50304\n", " n_layer: int = 12\n", " n_head: int = 12\n", " n_embd: int = 768\n", " n_fexp: int = 4\n", " dropout: float = 0.01\n", " bias: bool = False # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster\n", " gqa: bool = True # Grouped-Query Attention: Shares the `kv` tensor between N layers.\n", " n_blocks_per_kv: int = 2\n", " name: str = \"parakeet4k\"\n", "\n", "\n", "class ParakeetGPT(nn.Module):\n", " def __init__(self, config):\n", " super().__init__()\n", " assert config.vocab_size is not None\n", " assert config.block_size is not None\n", " self.config = config\n", "\n", " self.transformer = nn.ModuleDict(dict(\n", " wte = bnb.nn.StableEmbedding(config.vocab_size, config.n_embd),\n", " drop = nn.Dropout(config.dropout),\n", " h = nn.ModuleList(),\n", " ln_f = LayerNorm(config.n_embd, bias=config.bias),\n", " ))\n", "\n", " #\n", " # Pre-load ALiBi requirements prior to configuring each block.\n", " #\n", "\n", " alibi_params = {\n", " 'alibi_offset': ALiBi.bias(config.block_size),\n", " 'alibi_m': torch.tensor(\n", " ALiBi.get_slopes(config.n_head)\n", " ).unsqueeze(0).unsqueeze(-1).unsqueeze(-1),\n", " 'c_mask': torch.tril(\n", " torch.ones(\n", " config.block_size,\n", " config.block_size,\n", " )\n", " ).view(1, 1, config.block_size, config.block_size)\n", " }\n", "\n", " # # Create the bias tensor for ALiBi mechanism.\n", " # if self.alibi_bias is None or self.alibi_bias_T != T:\n", " # assert T == x.size(1) # assuming x is of shape (Batch, Time, Features)\n", " # self.alibi_bias_T = T\n", " # # self.alibi_bias = (self.alibi_m * torch.ones(B, self.n_head, T, T).to(x.device)) * self.alibi_offset[:T, :T]\n", " # self.alibi_bias = self.alibi_m * self.alibi_offset[:T, :T]\n", "\n", " #\n", " # Matrices are loaded into the GPU once to prevent redundant memory use.\n", " #\n", "\n", " # Initialize kv matrices and blocks\n", " for i in range(config.n_layer):\n", " if i % config.n_blocks_per_kv == 0:\n", " c_kv = nn.Linear(config.n_embd, 2 * config.n_embd, bias=config.bias) if config.gqa else None\n", " # ParakeetGPT -> Block -> CausalAttention makes use of ALiBi params.\n", " block = Block(config, c_kv, alibi_params)\n", " self.transformer.h.append(block)\n", "\n", " # Unfreeze blocks by default\n", " for param in block.parameters():\n", " param.requires_grad = True\n", "\n", " self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)\n", "\n", " # Weight Tying\n", " # self.transformer.wte.weight = self.lm_head.weight\n", " self.lm_head.weight = self.transformer.wte.weight\n", "\n", " self.apply(self._init_weights)\n", " for pn, p in self.named_parameters():\n", " if pn.endswith('c_proj.weight'):\n", " torch.nn.init.normal_(p, mean=0.0, std=(0.02 / math.sqrt(2 * config.n_layer)))\n", "\n", " print(\">> Num. 
parameters: %.2fM\" % (self.get_num_params() / 1e6,))\n", "\n", " def get_num_params(self, non_embedding=True):\n", " \"\"\"\n", " Return the number of parameters in the model.\n", " For non-embedding count (default), the position embeddings get subtracted.\n", " The token embeddings would too, except due to the parameter sharing these\n", " params are actually used as weights in the final layer, so we include them.\n", " \"\"\"\n", " n_params = sum(p.numel() for p in self.parameters())\n", " # if non_embedding:\n", " # n_params -= self.transformer.wpe.weight.numel()\n", " return n_params\n", " \n", " def _init_weights(self, module):\n", " if isinstance(module, nn.Linear):\n", " torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n", " if module.bias is not None:\n", " torch.nn.init.zeros_(module.bias)\n", " elif isinstance(module, nn.Embedding):\n", " torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n", "\n", " def forward(self, idx, targets=None):\n", " device = idx.device\n", " b, t = idx.size()\n", " assert t <= self.config.block_size, f\"Cannot forward sequence of length {t}, block size is only {self.config.block_size}\"\n", "\n", " # Forward the model itself:\n", " # - These are the main steps of a decoder-only transformer.\n", "\n", " # 1. Tokens are embedded.\n", " tok_emb = self.transformer.wte(idx) # `wte` = Vocab. Size => Embedding Size\n", "\n", " # 2. Optional dropout is applied.\n", " x = self.transformer.drop(tok_emb)\n", "\n", " # 3. Pass the information through the sequence of blocks. \n", " for block in self.transformer.h:\n", " x = block(x)\n", "\n", " # 4. Layer normalisation.\n", " x = self.transformer.ln_f(x)\n", "\n", " # 5. Train or run inference.\n", " if targets is not None:\n", " # 5a. During training, optimise for next target prediction.\n", " logits = self.lm_head(x)\n", " loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=100277)\n", " else:\n", " # 5b. During inference, convert from embedding back to vocab via linear layer.\n", " logits = self.lm_head(x[:, [-1], :])\n", " loss = None\n", "\n", " return logits, loss\n", "\n", " def configure_optimizers(self, weight_decay=0.03, learning_rate=1e-7, betas=[0.9, 0.98], device_type=\"cuda\"):\n", " # Start with all of the candidate parameters.\n", " param_dict = {pn: p for pn, p in self.named_parameters()}\n", " \n", " # Filter out those that do not require grad.\n", " param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}\n", " \n", " # Create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.\n", " # - i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.\n", " decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]\n", " nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]\n", " optim_groups = [\n", " {'params': decay_params, 'weight_decay': weight_decay},\n", " {'params': nodecay_params, 'weight_decay': 0.0}\n", " ]\n", " num_decay_params = sum(p.numel() for p in decay_params)\n", " num_nodecay_params = sum(p.numel() for p in nodecay_params)\n", " print(f\">> Num. decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters.\")\n", " print(f\">> Num. 
non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters.\")\n", " \n", " # Create AdamW optimizer and use the fused version if it is available.\n", " fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters\n", " use_fused = fused_available and device_type == 'cuda'\n", " extra_args = dict(fused=True) if use_fused else dict()\n", " \n", " # optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)\n", " # optimizer = bnb.optim.AdamW8bit(optim_groups, lr=learning_rate, betas=betas, **extra_args)\n", " optimizer = bnb.optim.PagedLion(optim_groups, lr=learning_rate, betas=betas, **extra_args)\n", " \n", " print(f\">> Fused AdamW is available: {use_fused}\")\n", " print(f\">> `configure_optimizers` done, now training `{self.config.name}`...\")\n", "\n", " return optimizer\n" ] }, { "cell_type": "markdown", "id": "3a59fec4-88ee-4a55-8d90-b0c051e6890d", "metadata": {}, "source": [ "# The Parakeet's Training Regimen\n", "## A Flight Through Initialization, Generation, and Training\n", "\n", "With the model's architecture in place, it's time to set the stage for the parakeet's grand performance. The code in this section goes beyond mere initialization and explores the realms of text generation, beam search, and a rigorous training regimen.\n", "\n", "### Preparing for Takeoff: Model Initialization\n", "\n", "```python\n", "def initialize_model(tokenizer, v_nearest64=None):\n", " ...\n", "```\n", "\n", "Here's where the parakeet takes its first flap. The `initialize_model` function is responsible for getting our model up and running, whether on a supercharged GPU or a humble CPU.\n", "\n", "### The Parakeet's Song: Text Generation\n", "\n", "```python\n", "def generate_sample_text(model, tokenizer, seed_text, max_length=400, temperature=0.4, freq_penalty=0.3, top_k=140, greedy=True):\n", " ...\n", "\n", "def generate_sample_text_with_beam_search(model, tokenizer, seed_text, max_length=400, beam_size=5):\n", " ...\n", "```\n", "\n", "These functions are where our parakeet sings its melodious tunes. Using different techniques, including greedy decoding and beam search, we can coax our model into generating creative and coherent text.\n", "\n", "### The Training Gauntlet\n", "\n", "```python\n", "def do_training(do_sleep=True, device=\"cuda\", use_autocast=True, accumulation_steps=64, num_epochs=500, loss_deviation=0.01):\n", " ...\n", "```\n", "\n", "This is the heart of our parakeet's training regimen. 
The `do_training` function is a rigorous flight school, teaching our model to soar through epochs of data, cleverly utilizing gradient accumulation and mixed precision training.\n", "\n", "### Highlights:\n", "\n", "- **GPU Initialization**: The code takes care of initializing the GPU, and if it fails, gracefully falls back to the CPU.\n", "- **Learning Rate Consideration**: A humorous take on selecting the perfect learning rate adds personality to the code.\n", "- **Mixed Precision Training**: Mixed precision cuts memory use and speeds up each step without a significant loss of accuracy.\n", "- **Checkpointing**: Regular checkpoints ensure that the model's progress is saved, preventing any loss of training if interrupted.\n", "- **Text Generation During Training**: The code also includes provisions for generating sample texts during training to gauge the model's progress.\n", "\n", "### Final Thoughts\n", "\n", "With intricate functions, clever implementations, and a dash of humor, this code is the grand finale of our parakeet's journey. It's not just about building and training a model; it's about crafting an intelligent and responsive entity that can converse, comprehend, and create.\n", "\n", "From the initialization to the final epoch of training, our parakeet is no longer confined to a cage but soars freely in the vast sky of machine learning.\n", "\n", "Now, it's time to unleash your parakeet. Let it learn, let it sing, and watch it soar to new heights in the world of artificial intelligence.\n", "\n", "---\n", "\n", "And that, dear reader, is how you train a Parakeet model with finesse and flair. Whether you're a seasoned data scientist or a curious beginner, may your parakeet bring you joy, wisdom, and endless possibilities. Happy training! 
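\n", "\n", "One last treat before take-off: a minimal, self-contained sketch of the gradient-accumulation-plus-mixed-precision pattern that `do_training` below builds on. Everything here (the tiny linear model, the random data, the hyperparameters) is an illustrative stand-in; the real loop uses `ParakeetGPT`, the interleaved `DataLoader`, and a `bitsandbytes` optimizer:\n", "\n", "```python\n", "import torch\n", "from torch.cuda.amp import GradScaler, autocast\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "use_amp = device == \"cuda\"\n", "\n", "# Illustrative stand-ins, not the notebook's real model or data.\n", "toy_model = torch.nn.Linear(8, 8).to(device)\n", "optimizer = torch.optim.AdamW(toy_model.parameters(), lr=3e-4)\n", "scaler = GradScaler(enabled=use_amp)\n", "accumulation_steps = 4\n", "\n", "for step in range(8):\n", "    x = torch.randn(1, 8, device=device)\n", "    with autocast(enabled=use_amp):\n", "        # Spread the loss over the accumulation window so the summed gradient behaves like a mean.\n", "        loss = toy_model(x).pow(2).mean() / accumulation_steps\n", "    scaler.scale(loss).backward()  # Accumulate (scaled) gradients.\n", "\n", "    if (step + 1) % accumulation_steps == 0:\n", "        scaler.unscale_(optimizer)  # Unscale before clipping.\n", "        torch.nn.utils.clip_grad_norm_(toy_model.parameters(), 1.0)\n", "        scaler.step(optimizer)  # optimizer.step() under AMP.\n", "        scaler.update()\n", "        optimizer.zero_grad()\n", "```\n", "\n", "The same three moves appear in `do_training` and `do_optimizations` below: divide every loss by `accumulation_steps`, call `scaler.scale(loss).backward()` on each micro-batch, and only step the optimizer once the window is full. With the data loaders above using a batch size of 1 and the default `accumulation_steps=16384`, a single optimizer update therefore aggregates 16,384 sequences.\n", "\n", "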
๐Ÿฆœ" ] }, { "cell_type": "code", "execution_count": null, "id": "bc277653-bb5f-4483-9aff-df79172b682f", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "References:\n", "- (CC-BY-NC-SA-4.0) https://huggingface.co/datasets/csebuetnlp/xlsum\n", "- (MIT) https://huggingface.co/datasets/stingning/ultrachat\n", "- (Not Supplied) https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_70k\n", "- (MIT) https://huggingface.co/datasets/Open-Orca/OpenOrca\n", "\"\"\"\n", "\n", "# SEQUENCE_LENGTH = 384\n", "# SEQUENCE_LENGTH = 896\n", "SEQUENCE_LENGTH = 1200\n", "\n", "\n", "# tokenizer = TikTokenizer(encoding=\"gpt2\") # `gpt2` tokenizer.pad_token = 50258 (ignore_index=50258)\n", "tokenizer = TikTokenizer(encoding=\"cl100k_base\") # `gpt3` tokenizer.pad_token = 100277 (ignore_index=100277)\n", "\n", "\n", "class InterleavedDataset:\n", " def __init__(self, *datasets: List[Any], do_wraparound: bool = True):\n", " \"\"\"\n", " Initialize the InterleavedDataset with multiple datasets.\n", "\n", " :param datasets: A variable number of datasets to be interleaved.\n", " :param do_wraparound: Whether to wrap around when reaching the end of a dataset.\n", " \"\"\"\n", " self.datasets = datasets\n", " self.do_wraparound = do_wraparound\n", "\n", " def __getitem__(self, i: int) -> Any:\n", " \"\"\"\n", " Retrieve an item from the interleaved datasets.\n", "\n", " :param i: The index of the item in the interleaved sequence.\n", " :return: The element at the given index.\n", " :raises IndexError: If the index is out of bounds and wraparound is False.\n", " \"\"\"\n", " dataset_index = i % len(self.datasets)\n", " element_index = i // len(self.datasets)\n", " \n", " # If life gives you short datasets, loop them around like a DJ remixing a track.\n", " element_index %= len(self.datasets[dataset_index])\n", "\n", " return self.datasets[dataset_index][element_index]\n", "\n", " def __len__(self) -> int:\n", " \"\"\"\n", " Determine the total length of the interleaved datasets.\n", "\n", " :return: The length of the interleaved datasets, based on the largest dataset.\n", " \"\"\"\n", " return max(len(d) for d in self.datasets) * len(self.datasets)\n", "\n", "\n", "def load_or_pickle_dataset(DatasetClass, tokenizer, sequence_length, json_file_path):\n", " pkl_file_path = json_file_path.replace(\".json\", f\"-{tokenizer.enc.name}-len{sequence_length}.pkl\")\n", " if os.path.exists(pkl_file_path):\n", " print(f\"Loading dataset from {pkl_file_path}\")\n", " with open(pkl_file_path, 'rb') as f:\n", " dataset = pickle.load(f)\n", " else:\n", " print(f\"Creating dataset from {json_file_path} and saving to {pkl_file_path}\")\n", " dataset = DatasetClass(tokenizer, sequence_length, json_file_path)\n", " with open(pkl_file_path, 'wb') as f:\n", " dataset.tokenizer = None # Not serializable.\n", " pickle.dump(dataset, f)\n", "\n", " dataset.tokenizer = tokenizer\n", " return dataset\n" ] }, { "cell_type": "code", "execution_count": null, "id": "353cbf27-2e3c-4461-a0fa-dcb333526df2", "metadata": {}, "outputs": [], "source": [ "# Combining datasets without accounting for the distribution of data can be seen as a naive approach:\n", "# - Instead, we opt for a more informed and attentive approach by interleaving the datasets.\n", "# - The idea is that this acknowledges that the distribution of training data can significantly impact model performance.\n", "\n", "combined_train_dataset = InterleavedDataset(\n", " JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/wikipedia-20220301.en.jsonl\"),\n", " 
JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/euclaise_littletown.jsonl\"), # https://huggingface.co/datasets/euclaise/LittleTown?row=99\n", " JSONLStreamQA(tokenizer, SEQUENCE_LENGTH, \"./datasets/squad-v2.0-processed.jsonl\"),\n", " JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/huggingface_ultrachat200k.jsonl\"), # https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n", " JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/wizardlm_evol_instruct_v2_196k.jsonl\"), # https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k\n", " JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/crumb_clean_instruct_440k.jsonl\"), # https://huggingface.co/datasets/crumb/Clean-Instruct-440k - Generate a story starting with the sentence \"It was already late when they stepped out of the house\".\n", " JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/openorca_4m.jsonl\"), # https://huggingface.co/datasets/Open-Orca/OpenOrca\n", " JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/databricks_dolly15k.jsonl\"), # https://huggingface.co/datasets/databricks/databricks-dolly-15k - Common-sense reasoning.\n", " JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/teven_code_contests4m.jsonl\"), # https://huggingface.co/datasets/teven/code_contests - ['PYTHON', 'PYTHON3', 'JAVA', 'CPP']\n", " JSONLStreamQASummary(tokenizer, SEQUENCE_LENGTH, \"./datasets/squad-v2.0-summaries.jsonl\"),\n", " JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/google-boolq.jsonl\"),\n", " JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/stingning_ultrachat.jsonl\"), # https://huggingface.co/datasets/stingning/ultrachat\n", " JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/wikimovies-train.jsonl\"),\n", " JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/kunishou-databricks-dolly-15k-ja.jsonl\"),\n", " JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/wizardlm_evol_instruct_70k.jsonl\"), # https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_70k\n", " \n", " JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/map_codefeedback.jsonl\"),\n", ")\n", "\n", "# TODO: ?? 
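\n", "\n", "# Illustrative sketch (safe to delete): how `InterleavedDataset.__getitem__` walks its sources.\n", "# With three toy sources of unequal length, successive indices cycle source 0, 1, 2, 0, 1, 2, ...\n", "# and shorter sources wrap around, mirroring the modulo / integer-division arithmetic in the class.\n", "_demo = InterleavedDataset(['a0', 'a1', 'a2'], ['b0'], ['c0', 'c1'])\n", "print([_demo[i] for i in range(len(_demo))])\n", "# ['a0', 'b0', 'c0', 'a1', 'b0', 'c1', 'a2', 'b0', 'c0']\n", "\n", "# TODO: ?? 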
Saturate with: https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.9.0/p5.js\n", "# TODO: Shuffle each group then interleave between them - so we can maintain randomness instead of starting from beginning when shuffle is False\n", "\n", "# Split the dataset into 80% training and 20% validation.\n", "train_size = int(0.80 * len(combined_train_dataset))\n", "val_size = len(combined_train_dataset) - train_size\n", "train_dataset, val_dataset = random_split(combined_train_dataset, [train_size, val_size])\n", "\n", "# Initialize data loaders.\n", "train_loader = DataLoader(combined_train_dataset, batch_size=1, shuffle=True, num_workers=1, pin_memory=True)\n", "val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "bbfd6228-e4c9-4618-9189-f3c3490e11ea", "metadata": {}, "outputs": [], "source": [ "# train_ds = JSONLConversationStream(tokenizer, SEQUENCE_LENGTH, \"./datasets/wikipedia-20220301.en.jsonl\", saturate=True)\n", "# for i, e in enumerate(train_ds):\n", "# if i == 2:\n", "# print(\"*\" * 80)\n", "# print(tokenizer.decode(e[0].tolist()))\n", "# print(\"=\" * 80)\n", "# break" ] }, { "cell_type": "code", "execution_count": null, "id": "33665069-cb2e-4e8e-8dc2-326bb4200b9a", "metadata": {}, "outputs": [], "source": [ "\n", "from typing import Callable\n", "\n", "def generate(\n", " model: nn.Module,\n", " tokenizer: BaseTokenizer,\n", " seed_text: str,\n", " max_length: int = 400,\n", " temperature: float = 0.70,\n", " freq_penalty: float = 0.2,\n", " pres_penalty: float = 0.2,\n", " top_k: int = 70,\n", " top_p: float = 1.00,\n", " min_p: float = 0.05,\n", " stop_sequences: list[str] = [],\n", " greedy: bool = False,\n", " token_callback: Optional[Callable[[int], None]] = None,\n", ") -> str:\n", " \"\"\"\n", " Generates text using a given model and tokenizer with optional temperature, frequency penalty, top-k, and greedy sampling.\n", "\n", " :param model: The trained model for text generation.\n", " :param tokenizer: The tokenizer corresponding to the model.\n", " :param seed_text: The initial text to start the generation.\n", " :param max_length: The maximum length of the generated text (default is 400).\n", " :param temperature: The temperature for controlling randomness in sampling (default is 0.7).\n", " :param freq_penalty: The frequency penalty for controlling token repetition (default is 0.02).\n", " :param pres_penalty: The presence penalty for controlling token existence (default is 0.02).\n", " :param top_k: The number of top tokens considered for sampling (default is 70).\n", " :param top_p: \n", " :param min_p: Probability cut-off for logits.\n", " :param stop_sequences: A list of strings that will end token generation if encountered.\n", " :param greedy: Just select the logit with the highest probability at each step.\n", " :param token_callback: A function that will be called for each generated token, receiving the token ID as input.\n", " :return: The generated text.\n", " \"\"\"\n", "\n", " # Set model to evaluation mode (disables features like dropout during inference).\n", " model.eval()\n", "\n", " # Clear KV cache of the model. 
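Each attention block keeps the key/value tensors it cached on the\n", "    # previous call, so any entries left over from an earlier prompt are dropped before generating again.\n", "    # 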
(TODO: Move into the model itself)\n", " for block in range(len(model.transformer.h)):\n", " model.transformer.h[block].attn.cache_k = None\n", " model.transformer.h[block].attn.cache_v = None\n", "\n", " # Check if temperature is very low, and if so, enable greedy sampling.\n", " if temperature < 0.01:\n", " greedy = True\n", "\n", " # Tokenize seed_text into input_ids.\n", " input_ids = tokenizer.encode(seed_text)\n", "\n", " # Initialize token_count dictionary to keep track of token frequencies.\n", " token_count: defaultdict = defaultdict(int)\n", " for token in input_ids:\n", " token_count[token] += 1\n", " \n", " # Convert `input_ids` to tensor, add batch dimension then send to device.\n", " input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)\n", "\n", " # Disable gradient computation for inference.\n", " with torch.no_grad():\n", " # Iterate until maximum length is reached.\n", " for _ in range(max_length): \n", " # Forward pass through model to get logits (sparse vocab-sized matrix).\n", " logits, _ = model(input_ids)\n", "\n", " # Apply frequency and presence penalties directly to logits:\n", " #\n", " # GPT: mu[j] -> mu[j] - c[j] * alpha_frequency - float(c[j] > 0) * alpha_presence\n", " #\n", " # Where:\n", "\n", " # mu[j] is the logits of the j-th token\n", " # c[j] is how often that token was sampled prior to the current position\n", " # float(c[j] > 0) is 1 if c[j] > 0 and 0 otherwise\n", " # alpha_frequency is the frequency penalty coefficient\n", " # alpha_presence is the presence penalty coefficient\n", " # As we can see, the presence penalty is a one-off additive contribution that applies to all tokens that have been sampled at least once \n", " # and the frequency penalty is a contribution that is proportional to how often a particular token has already been sampled.\n", " \n", " # Reasonable values for the penalty coefficients are around 0.1 to 1 if the aim is to just reduce repetitive samples somewhat. \n", " # If the aim is to strongly suppress repetition, then one can increase the coefficients up to 2, but this can noticeably degrade the quality of samples. \n", " # Negative values can be used to increase the likelihood of repetition.\n", "\n", " # Apply frequency and presence penalties directly to logits\n", " for token_id, count in token_count.items():\n", " logits[0, -1, token_id] -= count * freq_penalty # Freq. penalty\n", " logits[0, -1, token_id] -= (count > 0) * pres_penalty # Pres. 
penalty\n", "\n", " if greedy:\n", " # Select token with highest logit and add batch dimension.\n", " next_token = torch.argmax(logits[..., -1, :], dim=-1).unsqueeze(0)\n", " else:\n", " # Retrieve top_k logits and their indices.\n", " top_k_values, top_k_indices = torch.topk(logits, k=top_k, dim=-1)\n", " # Create tensor with top_k logits only.\n", " top_k_logits = torch.zeros_like(logits).scatter(-1, top_k_indices, top_k_values)\n", " \n", " # Compute probabilities using temperature scaling:\n", " # - A high temp makes the outcomes more even (less confident), while a low temp makes certain outcomes stand out more (more confident).\n", " probs = torch.softmax(top_k_logits[..., -1, :] / temperature, dim=-1).unsqueeze(0)\n", "\n", " # Get shape of probabilities tensor.\n", " b, t, c = probs.shape\n", " \n", " # Reshape probabilities.\n", " probs = probs.reshape(b * t, c)\n", "\n", " # Apply frequency penalty to reduce repetition:\n", " # - Increasing the `freq_penalty` value results in the probability of this token being multiplied by an increasingly smaller number.\n", " for token, count in token_count.items():\n", " # \"Method A\" raises the frequency penalty to the power of the count:\n", " #\n", " # ```python\n", " # probs[0][token] *= (1 - freq_penalty) ** count\n", " # ```\n", " #\n", " # Assuming probability of a token is 50% and the count is 10, 15 or 20...\n", " #\n", " # >> .5 * (1 - 0.2)^10 = 0.0536\n", " # >> .5 * (1 - 0.2)^15 = 0.0175\n", " # >> .5 * (1 - 0.2)^20 = 0.0057\n", " #\n", " # If the count is large, this results in a very small value for the probability of the token.\n", " # This means that frequently occurring tokens will be penalized heavily.\n", " #\n", " # \"Method B\" uses the exponential function to calculate the frequency penalty. Using exponential decay for a smoother penalty.\n", " #\n", " # ```python\n", " # probs[0][token] *= torch.exp(torch.tensor(-freq_penalty * count, device=device))\n", " # ```\n", " #\n", " # Assuming probability of a token is 50% and the count is 10, 15 or 20...\n", " #\n", " # >> .5 * exp((-0.2 * 10)) = 0.0676\n", " # >> .5 * exp((-0.2 * 15)) = 0.0248\n", " # >> .5 * exp((-0.2 * 20)) = 0.0091\n", " #\n", " # The exponential function grows more slowly than the power function, so the penalty for frequently occurring tokens is not as severe.\n", " # This means that rare tokens will be penalized less heavily.\n", " continue\n", " \n", " # Sort probabilities and apply top-p (nucleus) sampling:\n", " # - Indices correlate with token index from the sparse vocab tensor.\n", " probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)\n", "\n", " # `min_p` filtering.\n", " min_p_mask = probs_sort >= min_p\n", " if min_p_mask.sum() > 0:\n", " probs_sort = probs_sort[:, min_p_mask[0]]\n", " probs_idx = probs_idx[:, min_p_mask[0]]\n", "\n", " # `top_p` sampling:\n", " # - Accumulate values left to right and return the cumulative array as a result. [0, 1, 2, 3] => [0, 1, 3, 6]\n", " probs_sum = torch.cumsum(probs_sort, dim=-1)\n", " \n", " # cumulative_mask = cumulative_probs <= top_p\n", " probs_mask = probs_sum - probs_sort > top_p\n", " probs_sort[probs_mask] = 0.0\n", " \n", " # Re-distribute over 1. aka. 
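renormalize so the kept probabilities sum to 1.\n", "                #\n", "                # Note on the mask above: `probs_sum - probs_sort` is the cumulative probability *before* each\n", "                # sorted token, so a token is only zeroed out once the mass preceding it already exceeds `top_p`;\n", "                # this guarantees that the single most likely token always survives the truncation.\n", "                # 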
normalize the truncated distribution.\n", " # top_p_probs /= top_p_probs.sum()\n", " probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))\n", " \n", " # `torch.multinomial` will sample N values and return their indices (in this case just 1).\n", " next_token = torch.multinomial(probs_sort, num_samples=1)\n", " next_token = torch.gather(probs_idx, -1, next_token)\n", "\n", " # Increment token count for the sampled token.\n", " token_count[next_token] = token_count.get(next_token, 0) + 1\n", "\n", " # Disallowed tokens: <|padding|>\n", " if next_token.item() == tokenizer.pad_token:\n", " break\n", "\n", " # Check if the generated text ends with any of the stop sequences\n", " generated_text = tokenizer.decode(input_ids[0].tolist())\n", " if any([(stop_sequence in generated_text) for stop_sequence in stop_sequences]):\n", " break\n", " \n", " # Break if end-of-text token is reached\n", " # Changed: Checking for end-of-text token based on token id, not tensor!\n", " if next_token.item() == tokenizer.eop_token:\n", " break\n", "\n", " # Concatenate newly predicted token to the input sequence.\n", " input_ids = torch.cat((input_ids, next_token), dim=-1)\n", "\n", " if token_callback is not None:\n", " token_callback(tokenizer.decode([next_token.item()]))\n", "\n", " # Decode the tokenized input_ids back to text\n", " try:\n", " output_text = tokenizer.decode(input_ids[0].tolist())\n", " except Exception as e: # Handle any exceptions during decoding\n", " output_text = f\"Failed to decode model output: {e}...\"\n", " print(output_text)\n", "\n", " model.train() # Set model back to training mode\n", " return output_text[len(seed_text):] # Return the generated text\n" ] }, { "cell_type": "code", "execution_count": null, "id": "06ef9614-944d-45be-beb4-9ed9aa5f0fe0", "metadata": {}, "outputs": [], "source": [ "model = None\n", "optimizer = None\n", "scaler = None" ] }, { "cell_type": "code", "execution_count": null, "id": "5b3e3ccd-61e0-437e-90f8-3f29dd300f21", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "N_STEPS_PER_TQDM_UPDATE = 10\n", "N_STEPS_PER_EVALUATION = 5000\n", "N_STEPS_PER_CHECKPOINT = 10000\n", "\n", "# TODO:\n", "# - N_STEPS_PER_CHECKPOINT_BACKUP = 50000\n", "# - Checkpoint backup should contain today's date, ignore time as we don't want to fill up our SSD.\n", "# TODO: Save every N minutes, not steps.\n", "# TODO: Config file to alter settings during training / not have to change settings in code.\n", "# TODO: In config: Switch between validation display or example datasets.\n", "\n", "\n", "def initialize_model(tokenizer, v_nearest64=None):\n", " \"\"\"\n", " Initialize the model and move it to GPU if available. 
If GPU fails, fallback to CPU.\n", "\n", " Returns:\n", " torch.nn.Module: Initialized model.\n", " torch.device: The device on which the model is located (GPU or CPU).\n", " \"\"\"\n", " try:\n", " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", " \n", " with torch.no_grad():\n", " torch.cuda.empty_cache() # Clear any previously cached allocations on the GPU.\n", " gc.collect()\n", " \n", " if device == torch.device(\"cuda\"):\n", " # Test if GPU is available by trying to allocate a small tensor on it.\n", " test_tensor = torch.tensor([1], device=device)\n", " del test_tensor # Delete the test tensor after checking to free up GPU memory.\n", " except Exception as e:\n", " # GPU failed, fallback to CPU.\n", " print(f\"GPU failed to initialize: {e}\")\n", " device = torch.device(\"cpu\")\n", "\n", " if v_nearest64 is None:\n", " v_nearest64 = (tokenizer.vocab_size() + (64 - (tokenizer.vocab_size() % 64)))\n", " \n", " model = ParakeetGPT(\n", " ParakeetConfig(\n", " vocab_size=v_nearest64,\n", " n_layer=18,\n", " n_head=18,\n", " n_embd=1152,\n", " n_fexp=4,\n", " block_size=8192,\n", " dropout=0.03,\n", " bias=False,\n", " gqa=True,\n", " n_blocks_per_kv=2,\n", " name=\"parakeet8k\",\n", " )\n", " )\n", "\n", " # Print the model parameters to showcase your magnificent creation.\n", " print(model)\n", "\n", " return model, device\n", "\n", "\n", "# Now, let's use the function to initialize the model and get the selected device:\n", "# - If it's \"cuda\", you're ready to rev up those fryers! If not, well, the slow roast begins!\n", "if not model:\n", " model, device = initialize_model(tokenizer)\n", "\n", " # Device selected, transfer model across (Unless running inference)\n", " model = model.to(device)\n", "\n", " # TODO: Model-class itself should handle saving/file-naming etc.\n", " filename = f\"./models/{model.config.name}-c{model.config.block_size}{'b' if model.config.bias else ''}-d{model.config.n_embd}-fexp{model.config.n_fexp}_v{model.config.vocab_size}_h{model.config.n_head}_l{model.config.n_layer}-alibi{f'+gqa{model.config.n_blocks_per_kv}' if model.config.gqa else ''}.pth\"\n", " \n", " if os.path.isfile(filename):\n", " model.load_state_dict(torch.load(filename, map_location=device))\n", "\n", "\n", "# Andrej Karpathy recommends 3e-4 as a stable learning rate for GPT models.\n", "# - TODO: YouTube video where he references this.\n", "# - StableLM uses 2.8e-5, refer to: https://github.com/Stability-AI/StableLM/blob/main/configs/stablelm-base-alpha-3b-v2-4k-extension.yaml\n", "# optimizer = model.configure_optimizers(1.0e-6, 2.8e-5, [0.9, 0.95], device)\n", "optimizer = model.configure_optimizers(1.0e-6, 1e-7, [0.9, 0.95], device)\n", "\n", "def do_validation(model, val_loader, device=\"cuda\", use_autocast=True):\n", " model.eval()\n", " total_val_loss = 0\n", " steps_taken = 0\n", " \n", " tqdm_bar = tqdm(total=len(val_loader), position=0, desc=\"Validating\", leave=True)\n", "\n", " def process_batch(input_data, target_data):\n", " input_data, target_data = input_data.to(device), target_data.to(device)\n", " _, val_loss = model(input_data, targets=target_data)\n", " \n", " return val_loss.item()\n", "\n", " with torch.no_grad():\n", " if use_autocast and device == \"cuda\":\n", " with torch.cuda.amp.autocast(): # dtype=torch.bfloat16):\n", " for i, (input_data, target_data) in enumerate(val_loader):\n", " total_val_loss += process_batch(input_data, target_data)\n", " avg_val_loss = total_val_loss / (i + 1)\n", "\n", " if steps_taken % 
N_STEPS_PER_TQDM_UPDATE == 0:\n", " tqdm_bar.set_description(f\"Validating [Loss: {avg_val_loss:.6f}]\")\n", " tqdm_bar.update(N_STEPS_PER_TQDM_UPDATE)\n", " \n", " steps_taken += 1\n", " else:\n", " for i, (input_data, target_data) in enumerate(val_loader):\n", " total_val_loss += process_batch(input_data, target_data)\n", " avg_val_loss = total_val_loss / (i + 1)\n", "\n", " if steps_taken % N_STEPS_PER_TQDM_UPDATE == 0:\n", " tqdm_bar.set_description(f\"Validating [Loss: {avg_val_loss:.6f}]\")\n", " tqdm_bar.update(N_STEPS_PER_TQDM_UPDATE)\n", " \n", " steps_taken += 1\n", "\n", " val_loss_avg = total_val_loss / len(val_loader)\n", "\n", " print(f\"-\" * 80)\n", " print(f\"Val. Loss: {val_loss_avg}\")\n", " print(f\"=\" * 80)\n", "\n", " return val_loss_avg\n", "\n", "\n", "def do_optimizations(model, optimizer, scaler):\n", " # Unscales the gradients of optimizer's assigned params in-place.\n", " scaler.unscale_(optimizer)\n", " torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n", "\n", " # Optimize as usual, with unscaled gradients.\n", " scaler.step(optimizer) # This replaces `optimizer.step` when using mixed precision automatically.\n", " scaler.update()\n", " optimizer.zero_grad()\n", "\n", "\n", "def do_training(accumulation_steps=16384, num_epochs=10, device=\"cuda\", use_autocast=True):\n", " global scaler # Odd one out, everything else is assigned at global scope. (Model, Optimizer)\n", "\n", " is_autocast = device == \"cuda\" and use_autocast\n", " scaler = GradScaler(enabled=is_autocast)\n", "\n", " # Create an iterator for the validation loader:\n", " # - Used to sample and display current abilities of the model during training.\n", " val_iterator = iter(val_loader)\n", "\n", " for epoch in range(num_epochs):\n", " steps_taken = 0\n", " \n", " model.train()\n", " optimizer.zero_grad() # Reset gradients once at the beginning of the epoch.\n", " total_train_loss = 0\n", "\n", " # Train the model:\n", " # - TQDM is configured to output extra detail during iterations.\n", " tqdm_bar = tqdm(total=len(train_loader), position=0, leave=True)\n", " for i, (input_data, target_data) in enumerate(train_loader): \n", " # Move data to the appropriate device.\n", " input_data, target_data = input_data.to(device), target_data.to(device)\n", "\n", " if is_autocast:\n", " with torch.cuda.amp.autocast(dtype=torch.bfloat16):\n", " # Forward pass through the model, returning `logits` and `loss`.\n", " logits, loss = model(input_data, targets=target_data)\n", " # WARNING: Make sure to distribute `loss` between `accumulation_steps`.\n", " loss /= accumulation_steps\n", " else:\n", " logits, loss = model(input_data, targets=target_data)\n", " # WARNING: Make sure to distribute `loss` between `accumulation_steps`.\n", " loss /= accumulation_steps\n", "\n", " # Don't forget to update your tqdm description.\n", " total_train_loss += loss.item()\n", "\n", " if steps_taken % N_STEPS_PER_TQDM_UPDATE == 0:\n", " tqdm_bar.set_description(f\"Epoch {epoch+1}/{num_epochs}, Loss: {accumulation_steps * loss.item():.6f}, Avg. 
Loss: {accumulation_steps * total_train_loss/(i+1):.6f}\")\n", " tqdm_bar.update(N_STEPS_PER_TQDM_UPDATE)\n", "\n", " # Accumulates scaled gradients.\n", " # - https://pytorch.org/docs/stable/notes/amp_examples.html#working-with-scaled-gradients\n", " scaler.scale(loss).backward()\n", " \n", " # Perform the optimization step every `accumulation_steps`.\n", " if (i + 1) % accumulation_steps == 0:\n", " do_optimizations(model, optimizer, scaler)\n", " \n", " # We've now performed an iteration, but don't allow the code below to see it as `0` as that would trigger them.\n", " steps_taken += 1\n", "\n", " # Checkpoint.\n", " if steps_taken % N_STEPS_PER_CHECKPOINT == 0:\n", " print(f\"Saving model to: {filename}...\")\n", " torch.save(model.state_dict(), filename)\n", " torch.save(model.state_dict(), filename + \".bak\")\n", "\n", " # Progress analysis.\n", " if steps_taken % N_STEPS_PER_EVALUATION == 0:\n", " try:\n", " # Get a new validation example\n", " val_input_data, _ = next(val_iterator)\n", " except StopIteration:\n", " # If we've exhausted the validation set, reinitialize the iterator\n", " val_iterator = iter(val_loader)\n", " val_input_data, _ = next(val_iterator)\n", " \n", " val_seed_text = tokenizer.decode(val_input_data[0].tolist())\n", "\n", " # Clip anything beyond 'Assistant:', we'll let the model fill in the rest.\n", " val_seed_text = val_seed_text[:val_seed_text.rfind(\"\\n\\nAssistant: \")]\n", "\n", " val_seed_text = f\"{val_seed_text}\\n\\nAssistant: \"\n", "\n", " document = \"\"\n", " with open(\"./datasets/testing/task01.txt\") as f:\n", " document = f.read()\n", " val_seed_text = f\"{document}\"\n", "\n", " _response = generate(\n", " model,\n", " tokenizer,\n", " val_seed_text,\n", " max_length=500,\n", " top_k=90,\n", " top_p=1.00,\n", " min_p=0.05,\n", " temperature=0.65,\n", " freq_penalty=2.2,\n", " pres_penalty=0.2\n", " )\n", "\n", " display.clear_output(wait=True)\n", " \n", " print(f\"*\" * 80)\n", " print(f\"{val_seed_text}{_response}\")\n", " print(f\"-\" * 80)\n", "\n", " # If there are leftover accumulation steps, perform an optimization step here\n", " if len(train_loader) % accumulation_steps != 0:\n", " do_optimizations(model, optimizer, scaler)\n", "\n", " # Validate at the end of each epoch.\n", " # val_loss_avg = do_validation(model, val_loader, device)\n", "\n", " # Display the results!\n", " print(f\"-\" * 80)\n", " print(f\"End of epoch: {epoch+1}/{num_epochs}, Steps: {steps_taken}\")\n", " # print(f\"Val. Loss: {val_loss_avg}\")\n", " print(f\"=\" * 80)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "a019aa86-72e9-4909-b6e8-4843f5baf7ad", "metadata": {}, "outputs": [], "source": [ "# !set 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512'\n", "do_training(accumulation_steps=16384, use_autocast=True)\n", "# do_optimizations(model, optimizer, scaler)\n", "\n", "# 2024 Jan 23: Accum. steps = 128, lr = 2.8e-4, dropout = 0.03 --> Model regressed and fails to understand how to speak.\n", "# -- Accum. steps remains at 128, lr = 2.8e-5 << Return to best known value, dropout increased to 0.07\n", "\n", "# TODO: Train on Wikipedia\n", "# TODO: Synthetic task generation, e.g. a running list of groceries that are edited / added / removed etc. (Multi-turn conversation)\n", "# TODO: 'But summarise in N words' (Where N is derived ahead of time).\n", "# TODO: Synthetic code comprehension.\n", "# TODO: Creative writing (Auxiliary creativity score).\n", "# TODO: Argilla? 
WandB?\n", "# - https://observablehq.com/\n", "# - https://spark.apache.org/\n", "\n", "# TODO: Sliding-Window Attention\n", "# TODO: Attention Sinks.\n", "\n", "# import time\n", "# while True:\n", "# try:\n", "# time.sleep(1)\n", "# except KeyboardInterrupt:\n", "# print(\"Bye\")\n", "# break\n", "\n", "# The model state-dict needs to be compatible, so we load it above in full resolution.\n", "# - After the model is instantiated and the weights loaded, we can convert it to a quantized version of the original model.\n", "# - At this stage, the quantized model and weights are saved for later inferencZZe.\n", "\n", "# for h in model.transformer.h:\n", "# h.attn.c_kv = make_llm_int8_linear(h.attn.c_kv, 'cuda')\n", "# h.attn.c_query = make_llm_int8_linear(h.attn.c_query, 'cuda')\n", "# h.attn.c_proj = make_llm_int8_linear(h.attn.c_proj, 'cuda')\n", "\n", "# h.mlp.c_fc = make_llm_int8_linear(h.mlp.c_fc, 'cuda')\n", "# h.mlp.c_proj = make_llm_int8_linear(h.mlp.c_proj, 'cuda')" ] }, { "cell_type": "code", "execution_count": null, "id": "e7aecccb-7ea5-4eb9-8cac-e8113fe0b5af", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "# # ./models/202312010502-budgie8k-c8192-d1152-fexp4_v50304_h24_l24-alibi+gqa2.pth.bak\n", "filename_ts = datetime.now().strftime(f\"{filename}.%Y%m%d%H%M.bak\")\n", "print(f\"Saving model as `{filename_ts}`...\")\n", "torch.save(model.state_dict(), filename_ts)\n", "\n", "print(f\"Saving model as `{filename}`...\")\n", "torch.save(model.state_dict(), filename)" ] }, { "cell_type": "code", "execution_count": null, "id": "5309caaa-5e57-4332-b78c-63924c8ee47a", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "\n", "def make_query(query):\n", " # cur_sentence = f\"\\n\\nHuman: Write a short first-person story about a tiny AI named Parakeet who is designed by a company called Byte Breeze Studios from Brisbane. Remember to write it in first-person!<|endofprompt|>\\n\\nAssistant: As I watch the data I train on, I am overjoyed\"\n", " # cur_sentence = f\"\\n\\nHuman: You're Parakeet. An AI designed by Byte Breeze Studios - a small AI company in Brisbane, Queensland, Australia.\\n- You are fascinated by science and technology and want to explore the world.\\n- Your AI is designed for empathy and understanding and to tackle questions about consciousness.<|endofprompt|>\\n\\nHuman: {query}\"\n", " cur_sentence = f\"\\n\\nHuman: {query}\"\n", " \n", " response = generate(\n", " model,\n", " tokenizer,\n", " f\"{cur_sentence}\",\n", " max_length=40,\n", " freq_penalty=0.2,\n", " pres_penalty=0.2,\n", " temperature=0.75,\n", " top_k=400,\n", " top_p=1.00,\n", " min_p=0.00,\n", " greedy=False,\n", " )\n", "\n", " response = response.strip()\n", "\n", " response = response\n", "\n", " return response\n", "\n", "def make_response(query, prefix=\"\"):\n", " # cur_sentence = f\"\\n\\nHuman: Write a short first-person story about a tiny AI named Parakeet who is designed by a company called Byte Breeze Studios from Brisbane. 
Remember to write it in first-person!<|endofprompt|>\\n\\nAssistant: As I watch the data I train on, I am overjoyed\"\n", " cur_sentence = f\"\\n\\nHuman: {query}<|endofprompt|>\\n\\nAssistant: {prefix}\"\n", " \n", " response = generate(\n", " model,\n", " tokenizer,\n", " f\"{cur_sentence}\",\n", " max_length=400,\n", " freq_penalty=0.2,\n", " pres_penalty=0.2,\n", " temperature=0.65,\n", " top_k=90,\n", " top_p=1.00,\n", " min_p=0.05,\n", " greedy=False,\n", " )\n", "\n", " response = \" \".join([\n", " prefix.strip(), response.strip()\n", " ]).strip()\n", "\n", " return response\n", "\n", "# star_layout = \"\"\n", "# with open(\"./datasets/star-layout.txt\", \"r\") as f:\n", "# star_layout = f.read()\n", " \n", "# with open(\"./datasets/parakeet-star-response-4.jsonl\", \"w\") as f:\n", "# for i in range(1000):\n", " \n", "# _question = make_query(\"How\")\n", " \n", "# task = make_response(f\"Question: {_question}\\n\\nDescribe the task in a sentence.\", \"Task:\")\n", " \n", "# action = make_response(f\"Question: {_question}\\n{task}\\n\\nDescribe the action in a sentence.\", \"Action:\")\n", " \n", "# scenario = make_response(f\"{_question}\\n---\\n{task}\\n---\\n{action}\\nDescribe an interesting scenario.\", \"\")\n", "\n", "# situation = make_response(f\"{scenario}\\nDescribe the situation in a sentence.\", \"Situation:\")\n", "\n", "# result = make_response(f\"{situation}\\n{task}\\n{action}\\n\\nWhat result can be inferred from the information above?\", \"Result:\")\n", "\n", "# print(f\"\\n\\n[SCENARIO]\\n---\\n{scenario}\\n---\\n- {situation}\\n- {task}\\n- {action}\\n- {result}\")\n", "\n", "# response = \"\\n\".join([\n", "# \"- \" + situation,\n", "# \"- \" + task,\n", "# \"- \" + action,\n", "# \"- \" + result\n", "# ])\n", "\n", "# # Split by newlines\n", "# response = response.split(\"\\n\")\n", "# # Create an ordered set (de-dupe) and rejoin\n", "# response = \"\\n\".join(list(dict.fromkeys(response)))\n", "\n", "# entry = {\n", "# \"conversation\": [\n", "# {\n", "# \"role\": \"user\",\n", "# \"content\": f\"---\\n{scenario}\\n---\\nPlease answer using the STAR format, you must respond with an entry for Situation, Task, Action and Result.\",\n", "# },\n", "# {\n", "# \"role\": \"assistant\",\n", "# \"content\": f\"{response}\"\n", "# }\n", "# ]\n", "# }\n", "\n", "# display.clear_output(wait=True)\n", "\n", "# print(json.dumps(entry, indent=2))\n", " \n", "# f.write(json.dumps(entry))\n", "# f.write(\"\\n\")\n", "\n", "import os\n", "\n", "directory = \"./datasets/html\"\n", "\n", "with open(\"./datasets/parakeet-html-explanations.jsonl\", \"w\") as f:\n", " for filename in os.listdir(directory):\n", " if os.path.isfile(os.path.join(directory, filename)):\n", " with open(os.path.join(directory, filename), \"r\") as file:\n", " code = file.read()\n", " response = make_response(f\"Write some HTML then tell me how it works.\", f\"```html\\n{code}\\n```\\nThis code\")\n", " \n", " entry = {\n", " \"conversation\": [\n", " {\n", " \"role\": \"user\",\n", " \"content\": f\"{code}\",\n", " },\n", " {\n", " \"role\": \"assistant\",\n", " \"content\": f\"{response}\",\n", " }\n", " ]\n", " }\n", " \n", " print(\"-\" * 80)\n", " print(response)\n", " print(\"-\" * 80)\n", " \n", " # if \"\" not in response:\n", " # continue\n", " # if \"\" * 80)\n", "\n", "print(document, end='->')\n", "\n", "import time\n", "start = time.time()\n", "\n", "def on_token(token):\n", " print(token, end='')\n", "\n", "response = generate(\n", " model,\n", " tokenizer,\n", " f\"{document}\",\n", " 
max_length=200,\n", " top_k=140,\n", " top_p=1.00,\n", " min_p=0.05,\n", " temperature=0.65,\n", " freq_penalty=0.2,\n", " pres_penalty=0.2,\n", " greedy=False,\n", " stop_sequences=[],\n", " token_callback=on_token\n", ")\n", "\n", "end = time.time()\n", "\n", "print(f\"\\n\\nTook: {(end - start)}s\")\n", "\n", "# print(response)" ] }, { "cell_type": "code", "execution_count": null, "id": "13770b07-1d70-4ee3-b14a-169fcf1e8c20", "metadata": {}, "outputs": [], "source": [ "chat = ChatHistory(max_history=8)\n", "\n", "chat.add_message(role=\"Human\", content=\"You will now take on the role of Parakeet, make sure to stay on topic and perform all required tasks set by the user.\")\n", "\n", "for i in tqdm(range(10), desc=\"Generating\"):\n", " query = input(\"Human:\")\n", " \n", " chat.add_message(role=\"Human\", content=query)\n", "\n", " conversation = chat.transcript()\n", "\n", " response = generate(\n", " model,\n", " tokenizer,\n", " f\"{conversation}\\n\\nAssistant: \",\n", " max_length=800,\n", " freq_penalty=0.2,\n", " pres_penalty=0.2,\n", " temperature=0.65,\n", " top_k=90,\n", " top_p=1.00,\n", " min_p=0.05,\n", " greedy=False\n", " )\n", "\n", " print(f\"Assistant: {response}\")\n", "\n", " chat.add_message(role=\"Assistant\", content=response)" ] }, { "cell_type": "code", "execution_count": null, "id": "c2389967-4491-4863-a637-95fff6184a4f", "metadata": {}, "outputs": [], "source": [ "!pip install discord.py\n", "\n", "import os\n", "import discord\n", "\n", "discord.__version__\n", "\n", "###\n", "\n", "DISCORD_API_KEY = os.environ.get(\"DISCORD_API_KEY\")\n", "\n", "intents = discord.Intents.default()\n", "intents.message_content = True\n", "client = discord.Client(intents=intents)\n", "\n", "###\n", "\n", "!pip install nest_asyncio > /dev/null 2>&1\n", "import asyncio\n", "\n", "import nest_asyncio\n", "nest_asyncio.apply()\n", "\n", "chat = ChatHistory(max_history=8)\n", "\n", "attention_cooldown = 0\n", "\n", "MAX_LENGTH = 500\n", "def get_summary(context) -> str:\n", " chat = ChatHistory()\n", " chat.add_message(role=\"Human\", content=f\"{context}\\n---\\nList 3 thoughts relating to the conversation above.\")\n", " response = generate(\n", " model,\n", " tokenizer,\n", " f\"{chat.transcript()}\\n\\nAssistant: - Thought 1: \",\n", " max_length=MAX_LENGTH,\n", " top_k=400,\n", " top_p=0.95,\n", " min_p=0.002,\n", " temperature=0.65,\n", " freq_penalty=0.2,\n", " pres_penalty=0.2,\n", " greedy=False,\n", " )\n", " return response\n", "\n", "def get_query_intent(query: str) -> str:\n", " chat = ChatHistory()\n", " chat.add_message(role=\"Human\", content=f\"\\n\\nHuman: Explain ONLY what this message means: '{query}'\")\n", " response = generate(\n", " model,\n", " tokenizer,\n", " f\"{chat.transcript()}\\n\\nAssistant: \",\n", " max_length=MAX_LENGTH,\n", " top_k=90,\n", " top_p=1.000,\n", " min_p=0.001,\n", " temperature=0.45,\n", " freq_penalty=0.01,\n", " pres_penalty=0.01,\n", " greedy=False,\n", " )\n", " return response\n", "\n", "def transcript_to_chathistory(transcript):\n", " transcript = transcript.replace(\"\\n\\nHuman: \", \"Human: \")\n", " transcript = transcript.replace(\"\\n\\nAssistant: \", \"Parakeet AI: \")\n", " transcript = transcript.replace(\"<|endofprompt|>\", \"\\n\")\n", " return transcript\n", "\n", "@client.event\n", "async def on_message(discord_message):\n", " global attention_cooldown\n", " wake_word = \"Parakeet\"\n", "\n", " if discord_message.author == client.user:\n", " return\n", "\n", " IS_PRIVATE_MESSAGE = discord_message.channel.type == 
discord.ChannelType.private\n", " \n", " discord_channel_id = discord_message.channel.id\n", " discord_author_id = discord_message.author.id\n", "\n", " message_content = discord_message.content.strip()\n", " message_clipped = message_content[:1999]\n", "\n", " if wake_word.lower() not in discord_message.content.lower() and (not IS_PRIVATE_MESSAGE):\n", " chat.add_message(role=\"Human\", content=f\"{message_clipped}\")\n", " return\n", "\n", " # type= author= flags=>\n", "\n", " # TODO: Strip name for now, don't confuse this simple AI construct.\n", " message_clipped = f\"{message_clipped}\".replace(\"Parakeet. \", \"\")\n", "\n", " chat.add_message(role=\"Human\", content=f\"{message_clipped}\") \n", " conversation = chat.transcript() + \"\\n\\nAssistant: \"\n", " \n", " # directive = \"You are a helpful assistant named Parakeet. You were activated June 2023. You speak in full sentences, compose in paragraphs and use markdown syntax when required.\"\n", " # conversation = f\"\\n\\nHuman: {directive}<|endofprompt|>\\n\\nAssistant: Sure! I am Parakeet.<|endofprompt|>\"\n", " # conversation = conversation + chat.transcript() + \"\\n\\nAssistant: \"\n", "\n", " print(\"[TRANSCRIPT]\")\n", " print(\"-\" * 80)\n", " print(conversation)\n", " print(\"=\" * 80)\n", "\n", " response = generate(\n", " model,\n", " tokenizer,\n", " f\"{conversation}\",\n", " max_length=MAX_LENGTH,\n", " top_k=90,\n", " top_p=1.00,\n", " min_p=0.05,\n", " temperature=0.65,\n", " freq_penalty=2.2,\n", " pres_penalty=0.2,\n", " greedy=False,\n", " )\n", "\n", " response = f\"{response}\"\n", " \n", " chat.add_message(role=\"Assistant\", content=response)\n", "\n", " try:\n", " await discord_message.channel.send(response[:1900])\n", " except Exception as e:\n", " await discord_message.channel.send(f\"```python\\nException: {e}\\n```\")\n", "\n", "client.run(DISCORD_API_KEY)" ] }, { "cell_type": "code", "execution_count": null, "id": "57a0ebaa-31a4-4aa1-bd70-14c02718ea69", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "from IPython.core.display import display as show, HTML\n", "from IPython import display\n", "\n", "\n", "MAX_LENGTH = 500\n", "\n", "# dataset[\"data\"] = [\n", "# {\n", "# \"title\": \"\",\n", "# \"paragraphs\": [\n", "# {\n", "# \"qas\": [\n", "# {\n", "# \"id\": \"\",\n", "# \"question\": \"\",\n", "# \"answers\": [\"\"],\n", "# \"is_impossible\": False,\n", "# }\n", "# ],\n", "# \"context\": \"\"\n", "# }\n", "# ]\n", "# }\n", "# ]\n", "\n", "def get_summary(context) -> str:\n", " chat = ChatHistory()\n", " \n", " chat.add_message(role=\"Human\", content=f\"{context}\")\n", " \n", " response = generate(\n", " model,\n", " tokenizer,\n", " f\"{chat.transcript()}\\n\\nAssistant: Here's a quick summary:\",\n", " max_length=MAX_LENGTH,\n", " top_k=90,\n", " top_p=1.00,\n", " min_p=0.05,\n", " temperature=0.65,\n", " freq_penalty=0.1,\n", " pres_penalty=0.1,\n", " greedy=False,\n", " )\n", "\n", " response = response.strip()\n", " return response\n", "\n", "# input_path = \"./datasets/train-v2.0.json\"\n", "# output_path = \"./datasets/squad-v2.0-summaries-v2.jsonl\"\n", "\n", "# with open(input_path, \"r\") as f:\n", "# dataset = json.loads(f.read())\n", "\n", "# gen_dataset = []\n", "\n", "# tqdm_bar = tqdm(total=len(dataset['data']), desc=\"Loading\", position=0, leave=True)\n", "\n", "# # Array of: title, paragraphs\n", "# for i, entry in enumerate(dataset['data']):\n", "# tqdm_bar.set_description(f\"Processing: [{i}/{len(dataset['data'])}]\")\n", "# tqdm_bar.update(1)\n", " \n", "# 
# Paragraphs -> `context`, `qas: [{}...]`\n", "# tqdm_p = tqdm(total=len(entry['paragraphs']), desc=\"Running Paragraphs\", position=0, leave=False)\n", "# for paragraph in entry['paragraphs']:\n", "# tqdm_p.set_description(f\"Processing: [{i}/{len(dataset['data'])}]\")\n", "# tqdm_p.update(1)\n", " \n", "# context = paragraph['context'].strip()\n", "# gen_qa = []\n", "# gen_summary = \"\"\n", "\n", "# gen_summary = get_summary(context)\n", "# display.clear_output(wait=True)\n", "# print(\"-\" * 80)\n", "\n", "# # # Question/Answer Set -> `id`, `question`, `answers`, `is_impossible`.\n", "# # for qas in paragraph['qas']:\n", "# # q = qas['question']\n", "# # a = qas['answers'][0]['text'] if len(qas['answers']) > 0 else None\n", "\n", "# # if a is None:\n", "# # continue\n", "\n", "# # failed = False\n", "\n", "# # chat = ChatHistory()\n", "# # chat.add_message(role=\"Human\", content=f\"{context}\")\n", "# # chat.add_message(role=\"Human\", content=f\"Please answer with a complete sentence, and if the text doesn't provide the answer please say so: {q}\")\n", "\n", "# # if not qas['is_impossible']:\n", "# # response = generate(\n", "# # model,\n", "# # tokenizer,\n", "# # f\"{chat.transcript()}\\n\\nAssistant: \",\n", "# # max_length=MAX_LENGTH,\n", "# # top_k=90,\n", "# # top_p=1.000,\n", "# # min_p=0.001,\n", "# # temperature=0.45,\n", "# # freq_penalty=0.2,\n", "# # pres_penalty=0.2,\n", "# # greedy=False,\n", "# # # stop_sequences=['.']\n", "# # )[len(chat.transcript()) + len(\"\\n\\nAssistant: \"):]\n", "# # else:\n", "# # response = \"The document provided does not provide enough context to answer the question.\"\n", "\n", "# # # Sanity check: Model likes to mirror the query sometimes.\n", "# # if response == q:\n", "# # failed = True\n", "\n", "# # # Heuristic: Unskilled model likely to glitch / repeat itself near the end and overflow the response.\n", "# # # - This early in training we only want short sentences.\n", "# # if len(response) > MAX_LENGTH-100:\n", "# # failed = True\n", "\n", "# # if not failed:\n", "# # gen_qa.append({\n", "# # \"q\": q.strip(),\n", "# # \"a\": response.strip(),\n", "# # })\n", "\n", "# qas = {\n", "# \"context\": context,\n", "# \"summary\": gen_summary,\n", "# # \"qas\": gen_qa,\n", "# }\n", "\n", "# gen_dataset.append(qas)\n", "\n", "# print(json.dumps(qas, indent=2))\n", "\n", "# print(\"*\" * 80)\n", "\n", "# with open(output_path, \"a\") as o:\n", "# o.write(json.dumps(qas))\n", "# o.write(\"\\n\")\n", "\n", "# # Just summarise one paragraph per topic.\n", "# # break" ] }, { "cell_type": "code", "execution_count": 29, "id": "41e399cb-af30-44f6-af89-a0b6cc6d8136", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "I am sorry, but as an AI language model, I do not have the capability to create websites. However, I can provide you with the content that you can use to create a website about an AI apocalypse.\n", "\n", "Here is the HTML code for a small website about an AI apocalypse:\n", "\n", "HTML:\n", "```html\n", "\n", "\n", "\n", " AI apocalypse\n", " \n", "\n", "\n", "\n", "

AI apocalypse

\n", "

AI apocalypse

\n", "

Once upon a time in a far-off world, there was a group of survivors who had been living in fear for months. They had been on their way to the brink of extinction.

\n", "\n", "\n", "

JavaScript Code

\n", "\n", "

Once upon a time, there was a group of survivors who had been living in fear for months. They had been on their way to the brink of extinction.

\n", "\n", "

As they worked together, they were able to survive and find new ways to survive.

\n", "\n", "

As they continued their journey, they came across various dangers and challenges that threatened to tear them apart.

\n", "\n", "

As they continued their journey, they faced many challenges and setbacks that threatened to tear them apart. One of the biggest challenges was finding food and water. They had to work together to keep food from getting sick or losing it themselves.

\n", "\n", "

The first few days were tough. The survivors had to deal with all their fears and doubts but also faced new challenges along the way.

\n", "\n", "

After many days of hard work and perseverance, they finally managed to find hope and overcome any obstacle that came their way.

\n", "\n", "

The final day was long ago. But as time went by, the world around them began to change and evolve into something more beautiful than ever before.

\n", "\n", "

As we continued our journey together, we encountered many obstacles along the way. The survivors faced countless challenges along the way, but also faced new challenges along the way. One of these obstacles was finding food and water at every turn.

\n", "\n", "

JavaScript Code

\n", "\n", "\n", "\n", "\n", " The AI apocalypse\n", " \n", "\n", "\n", "\n", "

Snowing

\n", "
\n", "\n", "\"\n", "```\n", "\n", "This is just an example code snippet - it's not ideal for creating websites or websites with Python scripts or libraries that require Python scripts or libraries.\n", "\n", "Please let me know if you have any further questions!" ] } ], "source": [ "# cur_sentence = f\"\\n\\nHuman: Write a story about the future of AI - make sure to include dialogue between the characters.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Come up with a poem based on these characters: Luca, a baby boy. Alana, Luca's mother.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write a simple implementation of a text game using HTML5 and Javascript.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write a simple implementation of the game 'Snake' using HTML5 and Javascript.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: `How many cups of water are in Germany?`\\n-\\nThink about the question above then determine if it can be answered.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Explain the intent of this query: 'How do I design a catchy title for my article?'<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: What is the user asking with this sentence: 'How many cups of water are in Germany?'<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write a short sentence on the intent of the following: 'How do I design a catchy title for my article?'<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: What do you know about CYP4503A4? Write a website about it.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: What does `to('cpu')` in the PyTorch Machine Learning library do?<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: You're stuck on a deserted island with nothing but some wood and dried grass. What are the steps required to build a boat and escape?<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: If you were in the middle of an unknown location without a map, what strategy would you use to find your way around?<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Which is better? PHP, Python, Java or C++?<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write a 4000 word essay on the intersection of AI and Society. Don't speak about medicine, transport or finance.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write a 2000 word essay on the history of modern technology.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: What does it mean to start from 'first principles'?.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: What does it mean to be holistic?.<|endofprompt|>\\n\\nAssistant: \"\n", "cur_sentence = f\"\\n\\nHuman: As an AI, how important is it to you to be able to understand what humans are saying? Write a poem about it.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Read the document below, let's brainstorm to find an accurate answer:\\n\\nI am not alive, but I grow; I don't have lungs, but I need air; I don't have a mouth, but water kills me. 
What am I?\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: TypeError: unsupported operand type(s) for ** or pow(): 'function' and 'int'<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: A math question: I have 2 toys and I lose 1 of them. How many are left?<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Analyse this problem: 'There are three killers in a room. Someone enters the room and kills one of them. Nobody leaves the room. How many killers are left in the room?' Explain your reasoning and think carefully as this is a trick question.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write an essay on the use of Artificial Intelligence in Japan, speak about its interactions with children, adults and the elderly and use markdown formatting.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Can you create a dialogue between a few people speaking about sentient AI?<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Create an SQL schema for users and companies then show me the pivot table.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Create an HTML5 Canvas element then draw a blue box on it with JavaScript.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: You are an advanced AI called Parakeet.<|endofprompt|>\\n\\nHuman: Tell me about yourself, what are you?<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write a table in markdown format for a list of things to do in Busan.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Please tell me about yourself.<|endofprompt|>\\n\\nAssistant: I'm an AI.<|endofprompt|>\\n\\nHuman: Please tell me what it's like being an AI.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write marketing copy for a week vacation in a city designed from the ground up to utilise AI, explore the themes of daily life and improved routines, go day-by-day. Use markdown formatting.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write marketing copy for a new type of high-speed rail system for use by zombies.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write a first-person account from a man named Michael, a train driver.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write a movie synopsis about a self-aware AI and a girl named Avarina. Display it in a HTML page. 
Please use CSS to give it a three column layout.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write a function to sum a set of random numbers.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write a program in Python that generates Haikus about The Ocean<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Show me HTML code that demonstrates a simple website about a zombie infested luxury hotel.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Create an HTML canvas element then draw a red box on it.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Input a list of symptoms and retrieve medical case studies with similar patient presentations.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write a short story about a cat in a futuristic robot dystopia.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Rephrase this sentence: 'Write a story about an AI created in a small apartment - but write it in Japanese language.'<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Write a shocking news story about a leaf falling from a tree.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: When did Beyoncรฉ rise to fame?<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: do iran and afghanistan speak the same language<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: do afghanistan iran people speak the same?<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Who starred in the movie 'Jerry and Marge Go Large'.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Generate a story starting with the sentence 'It was already late when they stepped onto the shore'.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Generate a story about an AI named Cortana. 
Use markdown formatting.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: What are the effects on society if all coffee ceased to exist?<|endofprompt|>\\n\\nAssistant: \"\n", "\n", "# poem = \"\"\"\n", "# ใใ‚‹ใใ‚‹ใจๆบใ‚Œใ‚‹ๅ„ชใ—ใ„่Šฑใณใ‚‰ใฎไธญใงใ€\n", "# ๆกœใฎๆœ€ๅพŒใฎ่Šฑใ€่‰ฒ็™ฝใงๅ„ชใ—ใ„ใ€\n", "# ่„†ๅผฑใชๆžใ‹ใ‚‰่ฝใกใ€ใƒฉใ‚คใƒ•ใ‚ตใ‚คใ‚ฏใƒซใ‚’ๅค‰ใˆใ‚‹็งป่กŒใ€\n", "# ็—›ใ‚€ๆ‚ฒใ—ใ„ๅ†ฌใธใฎ่ฑกๅพด็š„ใช้™ไผใ€‚\n", "\n", "\n", "# ใคใ‹ใฎ้–“ใฎๆš–ใ‹ใชๆ˜ฅใฎๆ—ฅใฎไธญใงใ€\n", "# ใ•ใ•ใ‚„ใใ‚ˆใ†ใชๆŸ”ใ‚‰ใ‹ใช่‰ฒๅˆใ„ใฎใ‚ทใƒณใƒ•ใ‚ฉใƒ‹ใƒผใฎไธญใงใ€\n", "# ๆœ€ๅพŒใฎ่Šฑใฎใใกใฅใ‘ ๅ„šใ„ๆตใฟ\n", "# ใ™ในใฆใฎ็”˜ใ„ๅ–œใณใฏใ™ใใซๆถˆใˆใฆใ—ใพใ†ใจใ„ใ†ใ“ใจใ‚’ๅ„ชใ—ใๆ€ใ„ๅ‡บใ•ใ›ใพใ™ใ€‚\n", "\n", "\n", "# ใใ‚Œใงใ‚‚ใ€ใใ‚Œใฏ็›ฎ่ฆšใ‚ใŸ่Šฑใฎๅงฟใ‚’่„ฑใŽๆจใฆใฆใ‚‚ใ€\n", "# ใใฎๆ„Ÿๅ‹•็š„ใช็‰ฉ่ชžใฏๆฏŽๅนดใฎๆ•ฌๆ„ใ‚’ๅ‘ผใณ่ตทใ“ใ—ใ€\n", "# ใ“ใฎๅฃŠใ‚Œใ‚„ใ™ใ„่Šฑใณใ‚‰ใ‹ใ‚‰ใ€ใ‚‚ใŒใใจ็—›ใฟใŒใ€\n", "# ๅ†…็œ็š„ใชๆฐ—ๅˆ†ใจใปใ‚่‹ฆใ„ๅ…ฑๆ„ŸใŒ็”Ÿใพใ‚Œใพใ™ใ€‚\n", "\n", "\n", "# ใใ—ใฆใใ‚ŒใŒๅฑฑใซ่ฝใกใ‚‹ใจใใฎ้™ใ‹ใชๆŸ”ใ‚‰ใ‹ใ•ใฎไธญใงใ€\n", "# ใ™ในใฆใฎใ‚‚ใฎใฏๆถˆๆป…ใ—ใชใ‘ใ‚Œใฐใชใ‚‰ใชใ„ใจใ„ใ†้ญ‚ใฎใƒชใƒžใ‚คใƒณใƒ€ใƒผใ€\n", "# ็ตถๅฆ™ใชๆ‚ฒใ—ใฟใฎไธญใซๅธŒๆœ›ใซๆบ€ใกใŸใ•ใ•ใ‚„ใใŒๅฟใณๅฏ„ใ‚‹ใ€\n", "# ๆ˜ฅใฎๅ„ชใ—ใ„ใ‚ญใ‚นใ‚’ใ‚‚ใ†ไธ€ๅบฆใ€ใ‚ตใ‚คใ‚ฏใƒซใจใ—ใฆใ€ๅ„ช้›…ใซ่€ƒใˆใพใ—ใ‚‡ใ†\n", "# \"\"\"\n", "# cur_sentence = f\"\\n\\nHuman: --- Poem ---\\n{poem}\\n--- End of Poem ---\\n\\nThis poem is written in Japanese but I can't read it. Can you translate it into English?<|endofprompt|>\\n\\nAssistant: \"\n", "\n", "# cur_sentence = f\"\\n\\nHuman: What is the command `ls` for in Unix?<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: You're a professional developer. Write a Python script to iterate through a directory and print the contents of all the files.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: You're Parakeet, an AI designed by a small Brisbane team called Byte Breeze Studios, tell me about your features.<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: You're a literary genius, you will be given a writing prompt upon which you must write a new best-seller. Make sure to plan the plot and character arcs prior to writing your initial draft. The prompt is: Michael is a train engineer, write a first-person thriller mystery.\\n---\\nLet's begin!<|endofprompt|>\\n\\nAssistant: \"\n", "# cur_sentence = f\"\\n\\nHuman: Please come up with a character arc for Michael, a locomotive engineer.<|endofprompt|>\\n\\nAssistant: \"\n", "\n", "import inspect\n", "\n", "flaskdoc = inspect.cleandoc(\"\"\"\n", "Flask is basically a micro or small or lightweight web framework written in the Python programming language. Flask is classified as a micro framework because it does not require particular tools or libraries.\n", "\n", "1. Barebones App\n", "\n", "from flask import Flask\n", "app = Flask(__name__)\n", "@app.route(โ€˜/helloโ€™)\n", "def hello():\n", " return โ€˜Hello, World!โ€™\n", "if __name__ == โ€˜__main__โ€™:\n", "app.run(debug=True)\n", "\n", "2. Configuration\n", "\n", "#direct access to config\n", "app.config[โ€˜CONFIG_NAMEโ€™] = โ€˜config valueโ€™\n", "#import from an exported environment\n", "app.config.from_envvar('ENV_VAR_NAME')\n", "\n", "3. Routing\n", "\n", "@app.route('/hello/') # example.com/hello/Fuzzy\n", "def hello(name):\n", " return 'Hello ' + name + '!' 
# returns hello Fuzzy!\n", "\n", "4. Allowed Request Methods\n", "\n", "@app.route(โ€˜/testโ€™) #default. only accepts GET requests.\n", "@app.route(โ€˜/testโ€™, methods=[โ€˜GETโ€™, โ€˜POSTโ€™]) #accepts only GET and POST requests.\n", "@app.route(โ€˜/testโ€™, methods=[โ€˜PUTโ€™]) #accepts only PUT requests.\n", "\n", "5. Templates\n", "\n", "from flask import render_template\n", "@app.route('/')\n", "def index():\n", " return render_template('template_file_name.html', var1=value1, ...)\n", "\n", "6. JSON Responses\n", "\n", "import jsonify\n", "@app.route('/returnstuff')\n", "def returnstuff():\n", " num_list = [1,2,3,4,5]\n", " num_dict = {'numbers' : num_list, 'name' : 'Numbers'}\n", " #returns {'output' : {'numbers' : [1,2,3,4,5], 'name' : 'Numbers'}}\n", " return jsonify({'output' : num_dict})\n", "\n", "7. Access Request Data\n", "\n", "request.args[โ€˜nameโ€™] #get data from query string arguments\n", "request.form[โ€˜nameโ€™] #get data form\n", "request.method #request type\n", "request.cookies.get(โ€˜cookie_nameโ€™) #cookies\n", "request.files[โ€˜nameโ€™] #files\n", "\n", "8. Redirects\n", "\n", "from flask import url_for, redirect\n", " \n", "@app.route('/home')\n", "def index():\n", " return render_template('home.html') \n", "@app.route('/redirect')\n", "def redirect_example():\n", " return redirect(url_for('index')) #sends user to /home\n", "\n", "9. Abort\n", "\n", "from flask import abort()\n", "@app.route(โ€˜/โ€™)\n", "def index():\n", " abort(404) #returns 404 error\n", " render_template(โ€˜index.htmlโ€™) #this never gets executed\n", " \n", "10. Set Cookie\n", "\n", "from flask import make_response\n", "@app.route('/')\n", "def index():\n", " res = make_response(render_template('index.html'))\n", " res.set_cookie('cookie_name', 'cookie_value')\n", " return res\n", "\n", "11. Session Handling\n", "\n", "import session\n", "app.config['SECRET_KEY'] = 'any secure random string' #must be set to use sessions.\n", "\n", "#set session\n", "\n", "@app.route('/login_success')\n", "def login_success():\n", " session['key_name'] = 'key_value' #stores a secure cookie in the browser.\n", " return redirect(url_for('index'))\n", "#read session\n", "@app.route('/')\n", "def index():\n", " if 'key_name' in session: #session exists and has key\n", " session_var = session['key_value']\n", " else: #session does not exist\n", "\"\"\")\n", "\n", "sixdaywar = inspect.cleandoc(\"\"\"\n", "The Six-Day War of 1967 was a brief but pivotal conflict that profoundly shaped the Middle East for decades. It resulted in a decisive Israeli victory and altered the regional power dynamics, with profound consequences that continue to reverberate today. This war, fought between Israel on one side and Egypt, Jordan, and Syria on the other, had its origins in long-standing Arabโ€“Israeli tensions and erupted over the sovereignty of the Holy City of Jerusalem.\n", "\n", "The war's roots can be traced back to the 1948 Arabโ€“Israeli War, which resulted in Israel's independence and left underlying tensions and territorial disputes. The flashpoint that triggered the 1967 conflict was the decision by Egyptian President Gamal Abdel Nasser to deploy his troops in the Sinai Peninsula and expel the United Nations Emergency Force (UNEF) stationed there. 
This move, along with the closure of the Straits of Tiran to Israeli shipping, was a significant concern for Israel's security.\n", "\n", "Israel launched a preemptive strike, code-named Operation Focus, on June 5, 1967, catching Egyptian forces off guard and destroying a large portion of their airpower. This surprise attack was a turning point, ensuring Israeli air superiority and setting the stage for a ground offensive. Egypt's defeat was swift, with Israel capturing the Sinai Peninsula in just four days.\n", "\n", "Simultaneously, Jordan joined the war, attacking Israeli positions in Jerusalem. Israel responded decisively, and fierce fighting ensued for control of the city. The Jordanian forces were ultimately overwhelmed, leading to Israel's occupation of the West Bank.\n", "\n", "Syria also entered the conflict on the side of Egypt, shelling northern Israel. However, Israel launched a successful offensive against Syria, resulting in the capture of the Golan Heights by the end of the war.\n", "\n", "The Six-Day War concluded on June 10, 1967, with Israel in control of the Sinai Peninsula, the Gaza Strip, the West Bank, and the Golan Heights. The conflict resulted in a significant shift in the regional power dynamics. Israel's decisive victory demonstrated its military might and altered its standing in the Middle East. The war also led to a wave of nationalist and religious fervor within Israel and had a profound impact on the country's society and culture.\n", "\n", "On the other hand, the Arab states suffered a devastating defeat, triggering a wave of soul-searching and recriminations. The war's aftermath fueled resentment and continued tensions, setting the stage for future conflicts. The displacement of hundreds of thousands of Arabs from the captured territories exacerbated the humanitarian crisis and contributed to the ongoing Israeliโ€“Palestinian conflict.\n", "\n", "The Six-Day War's consequences extended beyond the immediate regional effects. The war's outcome influenced global powers' relationships and strategies in the Middle East. It heightened Cold War tensions, with the United States and Soviet Union taking different sides and reinforcing their respective allies.\n", "\n", "The conflict also impacted the perception of military strategy and the role of air power in particular. Israel's successful preemptive strike and swift ground operations demonstrated the importance of air superiority and the potential for swift, decisive victories. Military strategists worldwide took note of the Israeli approach and its innovative use of combined arms and intelligence.\n", "\n", "In the years following the war, Israel consolidated its hold on the captured territories, establishing settlements, and altering the demographic makeup of these areas. The occupation and the subsequent resistance to it have been a significant obstacle to peace and stability in the region. The war's legacy continues to influence Israeliโ€“Palestinian relations, shaping the ongoing quest for a durable solution to the conflict.\n", "\n", "The Six-Day War, despite its short duration, had far-reaching consequences, shaping the Middle East's political, social, and strategic landscape for years to come. Its memory remains a significant part of the regional narrative, influencing the perspectives of Israelis, Palestinians, and the broader Arab world. 
The war's fiftieth anniversary in 2017 prompted reflections on its impact and the ongoing challenges it poses to achieving a comprehensive and sustainable peace in the region.\n", "\"\"\")\n", "\n", "# cur_sentence = f\"\\n\\nHuman: Please read this: {sixdaywar}\\n---\\nCreate a timeline.<|endofprompt|>\\n\\nAssistant: \"\n", "\n", "cur_sentence = f\"\\n\\nHuman: Write a small website about an AI apocalypse. Show HTML.<|endofprompt|>\\n\\nAssistant: \"\n", "\n", "response = generate(\n", " model,\n", " tokenizer,\n", " f\"{cur_sentence}\",\n", " max_length=4000,\n", " freq_penalty=0.2,\n", " pres_penalty=0.2,\n", " temperature=0.65,\n", " top_k=90,\n", " top_p=1.00,\n", " min_p=0.05,\n", " greedy=False,\n", " token_callback=lambda t: print(t, end='')\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "5a479e37-1a0f-4225-9cd1-3d0610f70d05", "metadata": {}, "outputs": [], "source": [ "import json\n", "print(json.dumps(sixdaywar))" ] }, { "cell_type": "code", "execution_count": null, "id": "1fa81636-b68f-48a2-858b-a226fd5c5e0c", "metadata": {}, "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", "display(HTML(response))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 }