documentation for task #2

Open · wants to merge 10 commits into `main`
70 changes: 66 additions & 4 deletions README.md
@@ -1,14 +1,76 @@
# Emo-2-SNAC to LLaMA 3 Audio Token Conversion

This repository contains scripts and documentation for converting audio samples from the [0xd4t4/Emo-2-SNAC](https://huggingface.co/0xd4t4/Emo-2-SNAC) dataset into tokens compatible with the LLaMA 3 language model. This project is part of a collaboration with LAION to develop advanced audio processing capabilities for [AI assistants](https://laion.ai/notes/open-gpt-4-o/).

## Table of Contents

- [Introduction](#introduction)
- [Dataset](#dataset)
- [Requirements](#requirements)
- [Installation](#installation)
- [Usage](#usage)
- [Project Structure](#project-structure)
- [License](#license)
- [Acknowledgements](#acknowledgements)

## Introduction

The goal of this project is to enhance the audio processing capabilities of the LLaMA 3 language model by converting audio samples into discrete tokens. These tokens will then be used to train the model to understand and generate audio, similar to how it processes text.

## Dataset

The dataset used in this project is the [0xd4t4/Emo-2-SNAC](https://huggingface.co/0xd4t4/Emo-2-SNAC) dataset, available on Hugging Face. This dataset contains audio samples labeled with emotional content, which will be converted into SNAC (Multi-Scale Neural Audio Codec) tokens.
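
As a quick sanity check after the download step in Usage, one of the parquet shards can be inspected directly (a hedged sketch; the `txt` column name and the `dataset/default/partial-train/` path are taken from `codec-to-token.py` below):

```python
from datasets import load_dataset

# Load a single downloaded parquet shard and print the start of the first SNAC-encoded sample.
ds = load_dataset("parquet", data_files="dataset/default/partial-train/0000.parquet")
print(ds["train"][0]["txt"][:200])
```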

## Requirements

- Python 3.8 or higher
- Hugging Face `datasets` and `huggingface_hub` (for `huggingface-cli`); see `requirements.txt`

## Installation

(Optional) Create and activate your own environment.
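
A minimal sketch, assuming the standard `venv` module (any other environment manager works equally well):

``` bash
python -m venv .venv
source .venv/bin/activate
```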

## Usage

1. Clone the repository:

``` bash
git clone https://github.com/LAION-AI/snac-to-llama3.git
cd snac-to-llama3
```

2. Install the requirements:

``` bash
pip install -r requirements.txt
```

3. Download the Hugging Face dataset:

``` bash
huggingface-cli download 0xd4t4/Emo-2-SNAC --local-dir ./dataset --revision refs/convert/parquet --repo-type dataset
```

4. Build the SNAC-code-to-LLaMA-3-token vocabulary (the script expects the LLaMA 3 `tokenizer.json` in the working directory):

```bash
python codec-to-token.py
```
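
The script writes `snac-to-llama.json` in the repository root (the file name comes from the script's `__main__`); as a quick, optional check:

``` bash
python -c "import json; print(len(json.load(open('snac-to-llama.json'))))"
```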

5. Apply byte-pair encoding to the token sequences:
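
Presumably this step runs `byte-pair-encoding.py` directly; the exact invocation (and any arguments it may take) is an assumption:

``` bash
python byte-pair-encoding.py
```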

## Project Structure

- `codec-to-token.py`: Script that maps the SNAC codes in the dataset to LLaMA 3 tokenizer entries and saves the mapping to `snac-to-llama.json` (a usage sketch follows after this list).
- `byte-pair-encoding.py`: Script that applies byte-pair encoding to the SNAC token sequences for the LLaMA 3 model.
- `requirements.txt`: List of dependencies required for the project.
- `README.md`: This file.
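
A hedged sketch of how the mapping saved by `codec-to-token.py` might be consumed downstream; the file name `snac-to-llama.json` comes from that script, while the lookup logic here is purely illustrative:

```python
import json

# Load the SNAC-code -> LLaMA 3 token mapping produced by codec-to-token.py.
with open("snac-to-llama.json") as f:
    itos = json.load(f)

# Translate a whitespace-separated string of SNAC codes (the dataset's `txt` format)
# into the corresponding LLaMA 3 token strings.
snac_codes = "123 456 789"  # illustrative values; real codes come from the dataset
llama_tokens = [itos[code] for code in snac_codes.split() if code in itos]
print(llama_tokens)
```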


## License

This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.

## Acknowledgements

This project is a collaboration with [LAION](https://laion.ai/). Special thanks to [0xd4t4](https://huggingface.co/0xd4t4) for publishing the [Emo-2-SNAC](https://huggingface.co/0xd4t4/Emo-2-SNAC) dataset on Hugging Face.
61 changes: 61 additions & 0 deletions codec-to-token.py
@@ -0,0 +1,61 @@
from datasets import load_dataset
import json


class Vocabulary:
    """Class to map SNAC codes from the Hugging Face dataset to tokens in the Llama 3-8B tokenizer."""

    def __init__(self):
        self.stoi = {}
        self.itos = {}

    def build_vocabulary(self, parquet_files, tokenizer_file="tokenizer.json"):
        '''
        Creates the vocabulary from the Llama 3 tokenizer and the Hugging Face dataset.
        Args:
            parquet_files (list): parquet files containing the Hugging Face dataset
            tokenizer_file (str): file downloaded from Llama 3 (8B) which contains the model's vocabulary
        '''
        # Load the Llama 3 tokenizer vocabulary from the JSON file
        with open(tokenizer_file, 'r') as file:
            data = json.load(file)

        llama_stoi = data['model']['vocab']
        llama_itos = {value: key for key, value in llama_stoi.items()}

        # Load the Hugging Face data
        dataset = load_dataset('parquet', data_files=parquet_files)
        vocabulary = set()

        # Collect every distinct SNAC code that appears in the dataset
        for sent in dataset["train"]["txt"]:
            for word in sent.split():
                vocabulary.add(word)

        # Map each SNAC code (kept as a string so it serialises cleanly to JSON)
        # to the Llama 3 token string with the same integer id
        self.itos = {value: llama_itos[int(value)] for value in vocabulary}
        self.stoi = {value: key for key, value in self.itos.items()}

    def save(self, file_path):
        '''
        Saves the code-to-Llama-token mapping (self.itos) to a JSON file to be loaded later.
        Args:
            file_path (str): file where the dict will be saved
        '''
        with open(file_path, "w") as file:
            json.dump(self.itos, file, indent=4)


if __name__ == "__main__":
    test_dir = [f"dataset/default/partial-train/000{i}.parquet" for i in range(10)]
    vocab = Vocabulary()
    # Building the vocabulary
    vocab.build_vocabulary(test_dir)

    # Saving it to file
    vocab.save("snac-to-llama.json")




53 changes: 0 additions & 53 deletions main.py
@@ -1,53 +0,0 @@
from datasets import load_dataset
import json


class Vocabulary:
    """Class to map codes from huggingface dataset to tokens in Llama 3-8B token"""

    def __init__(self):
        self.stoi = {}
        self.itos = {}

    def build_vocabulary(self, parquet_files, tokenizer_file="tokenizer.json"):
        '''
        creates the vocabulary from the Llama 3 tokenizer and hugging face dataset
        Args:
            tokenizer_file(str): file downloaded from Llama 3(8B) which contains the vocabulary for the model
            parquet_files(list): director with the dataset from hugging face in parquet format

        '''
        # Open the JSON file
        with open(tokenizer_file, 'r') as file:
            # Load the JSON data
            data = json.load(file)

        llama_stoi = data['model']['vocab']
        llama_itos = {value:key for key,value in llama_stoi.items()}

        #load hugging face data
        dataset = load_dataset('parquet', data_files=parquet_files)
        vocabulary = set()

        for sent in dataset["train"]["txt"]:
            for word in sent.split():
                vocabulary.add(word)

        self.itos = {int(value):llama_itos[int(value)] for value in vocabulary}
        self.stoi = {value:key for key,value in self.itos.items()}

    def save(self, file_path):
        with open(file_path, "w") as file:
            json.dump(self.itos, file, indent=4)


if __name__ == "__main__":
    test_dir = [f"dataset/default/partial-train/000{i}.parquet" for i in range(10)]
    vocab = Vocabulary()

    vocab.build_vocabulary(test_dir)
    vocab.save("snac-to-llama.json")




2 changes: 1 addition & 1 deletion test.ipynb → research/test.ipynb
@@ -156,7 +156,7 @@
" llama_itos = {value:key for key,value in llama_stoi.items()}\n",
"\n",
" #load hugging face data\n",
" dataset = load_dataset('parquet', data_files=test_dir)\n",
" dataset = load_dataset('parquet', data_files=parquet_files)\n",
" vocabulary = set()\n",
"\n",
" for sent in dataset[\"train\"][\"txt\"]:\n",