Documentation dataset format #2020

Merged
merged 74 commits from `dataset_format` into `main`
Sep 11, 2024
Changes from 66 commits
74 commits
f8dffef
first piece of doc
qgallouedec Sep 5, 2024
6fe9e98
improve readibility
qgallouedec Sep 5, 2024
6f3d846
some data utils and doc
qgallouedec Sep 5, 2024
d83ead9
simplify prompt-only
qgallouedec Sep 5, 2024
e2709e5
format
qgallouedec Sep 5, 2024
5508d37
fix path data utils
qgallouedec Sep 5, 2024
6d18f9d
fix example format
qgallouedec Sep 5, 2024
cb6eaa3
simplify
qgallouedec Sep 5, 2024
1a7d77d
tests
qgallouedec Sep 5, 2024
929d43e
prompt-completion
qgallouedec Sep 5, 2024
3ccabea
update antropic hh
qgallouedec Sep 5, 2024
548222c
update dataset script
qgallouedec Sep 5, 2024
d820ef5
implicit prompt
qgallouedec Sep 7, 2024
2bbf766
additional content
qgallouedec Sep 7, 2024
4d894e0
`maybe_reformat_dpo_to_kto` -> `unpair_preference_dataset`
qgallouedec Sep 7, 2024
b16337e
Preference dataset with implicit prompt
qgallouedec Sep 7, 2024
8b32301
unpair preference dataset tests
qgallouedec Sep 7, 2024
27360ac
documentation
qgallouedec Sep 7, 2024
60fa768
...
qgallouedec Sep 7, 2024
97b7cde
doc
qgallouedec Sep 7, 2024
eb00261
changes applied to dpo example
qgallouedec Sep 7, 2024
3a9b7ab
better doc and better log error
qgallouedec Sep 7, 2024
74ed8e7
a bit more doc
qgallouedec Sep 7, 2024
444083d
improve doc
qgallouedec Sep 7, 2024
fe996c3
converting
qgallouedec Sep 8, 2024
cf5ef88
some subsections
qgallouedec Sep 8, 2024
6a891fe
Merge branch 'main' into dataset_format
qgallouedec Sep 8, 2024
7a3242c
converting section
qgallouedec Sep 8, 2024
ea7ddf6
further refinements
qgallouedec Sep 8, 2024
cb9a344
tldr
qgallouedec Sep 9, 2024
a0bf787
tldr preference
qgallouedec Sep 9, 2024
cb1083e
rename
qgallouedec Sep 9, 2024
da81e60
lm-human-preferences-sentiment
qgallouedec Sep 9, 2024
28312d7
`imdb` to `stanfordnlp/imdb`
qgallouedec Sep 9, 2024
8cf6347
Add script for LM human preferences descriptiveness
qgallouedec Sep 9, 2024
fa38e0b
Remove sentiment_descriptiveness.py script
qgallouedec Sep 9, 2024
ea06ea3
style
qgallouedec Sep 9, 2024
a5193be
example judge tlrd with new dataset
qgallouedec Sep 9, 2024
d8ff5e0
Syle
qgallouedec Sep 9, 2024
2b9aa71
Dataset conversion for TRL compatibility
qgallouedec Sep 9, 2024
ac0b8c8
further refinements
qgallouedec Sep 9, 2024
9bc6cf9
trainers in doc
qgallouedec Sep 9, 2024
6f5c249
top level for functions
qgallouedec Sep 9, 2024
d8dd465
stanfordnlp/imdb
qgallouedec Sep 9, 2024
81c60b5
Merge branch 'main' into dataset_format
qgallouedec Sep 10, 2024
157edf9
downgrade transformers
qgallouedec Sep 10, 2024
102dc9c
Merge branch 'dataset_format' of https://github.com/huggingface/trl i…
qgallouedec Sep 10, 2024
22249da
temp reduction of tests
qgallouedec Sep 10, 2024
bd0eb0e
next commit
qgallouedec Sep 10, 2024
7bde159
next commit
qgallouedec Sep 10, 2024
7197222
Merge branch 'main' into dataset_format
qgallouedec Sep 10, 2024
7b09202
additional content
qgallouedec Sep 10, 2024
504d64f
proper tick format
qgallouedec Sep 10, 2024
c4b8e46
precise the assistant start token
qgallouedec Sep 10, 2024
f366398
improve
qgallouedec Sep 10, 2024
fb9df62
lower case
qgallouedec Sep 10, 2024
bd8c95d
Update titles in _toctree.yml and data_utils.mdx
qgallouedec Sep 10, 2024
4c199e0
revert make change
qgallouedec Sep 10, 2024
69ffd4e
correct dataset ids
qgallouedec Sep 10, 2024
235c7fe
expand a bit dataset formats
qgallouedec Sep 10, 2024
1cd829c
skip gated repo tests
qgallouedec Sep 10, 2024
5d85b91
data utilities in API
qgallouedec Sep 10, 2024
d0932cb
Update docs/source/dataset_formats.mdx
qgallouedec Sep 11, 2024
fab82e1
Update docs/source/dataset_formats.mdx
qgallouedec Sep 11, 2024
b434444
Update docs/source/dataset_formats.mdx
qgallouedec Sep 11, 2024
a7ef8c3
Update docs/source/dataset_formats.mdx
qgallouedec Sep 11, 2024
040d4cf
tiny internal testing for chat template testing
qgallouedec Sep 11, 2024
af0ad76
Merge branch 'dataset_format' of https://github.com/huggingface/trl i…
qgallouedec Sep 11, 2024
c2b6574
precise type/format
qgallouedec Sep 11, 2024
b35ad53
exlude sft trainer in doc
qgallouedec Sep 11, 2024
16aa25e
Update trl/trainer/utils.py
qgallouedec Sep 11, 2024
2a7ef20
Merge branch 'main' into dataset_format
qgallouedec Sep 11, 2024
24a75c2
Merge branch 'dataset_format' of https://github.com/huggingface/trl i…
qgallouedec Sep 11, 2024
814cd38
XPO in the doc
qgallouedec Sep 11, 2024
8 changes: 6 additions & 2 deletions docs/source/_toctree.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
- sections:
- local: index
title: TRL
- local: quickstart
title: Quickstart
- local: installation
title: Installation
- local: quickstart
title: Quickstart
- local: clis
title: Get started with Command Line Interfaces (CLIs)
- local: dataset_formats
title: Dataset Formats
- local: how_to_train
title: PPO Training FAQ
- local: use_model
@@ -59,6 +61,8 @@
title: Judges
- local: callbacks
title: Callbacks
- local: data_utils
title: Data Utilities
- local: text_environments
title: Text Environments
title: API
15 changes: 15 additions & 0 deletions docs/source/data_utils.mdx
@@ -0,0 +1,15 @@
## Data Utilities

[[autodoc]] is_conversational

[[autodoc]] apply_chat_template

[[autodoc]] maybe_apply_chat_template

[[autodoc]] extract_prompt

[[autodoc]] maybe_extract_prompt

[[autodoc]] unpair_preference_dataset

[[autodoc]] maybe_unpair_preference_dataset
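Among the utilities above, `unpair_preference_dataset` converts a paired preference dataset (one row holding both a `chosen` and a `rejected` completion) into an unpaired one (two rows, each with a `completion` and a boolean `label`). The helper below is an illustrative sketch of that transformation written with plain dicts, not TRL's implementation; the column names follow the documentation added in this PR.

```python
def unpair(rows):
    """Sketch: turn paired preference rows into unpaired, labeled rows."""
    out = []
    for row in rows:
        # The chosen completion becomes a positively labeled row...
        out.append({"prompt": row["prompt"], "completion": row["chosen"], "label": True})
        # ...and the rejected completion a negatively labeled one.
        out.append({"prompt": row["prompt"], "completion": row["rejected"], "label": False})
    return out

paired = [{"prompt": "The sky is", "chosen": " blue.", "rejected": " green."}]
unpaired = unpair(paired)
```

Each paired row yields exactly two unpaired rows, which is the shape expected by trainers that consume unpaired data such as KTO (hence the old name `maybe_reformat_dpo_to_kto` seen in the commit list).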
709 changes: 709 additions & 0 deletions docs/source/dataset_formats.mdx

Large diffs are not rendered by default.

122 changes: 0 additions & 122 deletions examples/datasets/anthropic_hh.py

This file was deleted.

82 changes: 82 additions & 0 deletions examples/datasets/hh-rlhf-helpful-base.py
@@ -0,0 +1,82 @@
import re
from dataclasses import dataclass
from typing import Dict, List, Optional

from datasets import load_dataset
from transformers import HfArgumentParser


@dataclass
class ScriptArguments:
r"""
Arguments for the script.

Args:
push_to_hub (`bool`, *optional*, defaults to `False`):
Whether to push the dataset to the Hugging Face Hub.
repo_id (`str`, *optional*, defaults to `"trl-lib/hh-rlhf-helpful-base"`):
Hugging Face repository ID to push the dataset to.
dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`):
Number of workers to use for dataset processing.
"""

push_to_hub: bool = False
repo_id: str = "trl-lib/hh-rlhf-helpful-base"
dataset_num_proc: Optional[int] = None


def common_start(str1: str, str2: str) -> str:
# Zip the two strings and iterate over them together
common_chars = []
for c1, c2 in zip(str1, str2):
if c1 == c2:
common_chars.append(c1)
else:
break
# Join the common characters and return as a string
return "".join(common_chars)


def extract_dialogue(example: Dict[str, str]) -> Dict[str, List[Dict[str, str]]]:
# Extract the prompt, which corresponds to the common start of the chosen and rejected dialogues
prompt_text = common_start(example["chosen"], example["rejected"])

# The common start may run past the final generation prompt when the chosen and
# rejected responses begin with the same characters, so truncate it back to the
# last "\n\nAssistant: "
if not prompt_text.endswith("\n\nAssistant: "):
prompt_text = prompt_text[: prompt_text.rfind("\n\nAssistant: ")] + "\n\nAssistant: "

# Extract the chosen and rejected lines
chosen_line = example["chosen"][len(prompt_text) :]
rejected_line = example["rejected"][len(prompt_text) :]

# Remove the generation prompt ("\n\nAssistant: ") from the prompt
prompt_text = prompt_text[: -len("\n\nAssistant: ")]

# Split the string at every occurrence of "Human: " or "Assistant: "
prompt_lines = re.split(r"(\n\nAssistant: |\n\nHuman: )", prompt_text)

# Remove the first element as it's empty
prompt_lines = prompt_lines[1:]

prompt = []
for idx in range(0, len(prompt_lines), 2):
role = "user" if prompt_lines[idx] == "\n\nHuman: " else "assistant"
content = prompt_lines[idx + 1]
prompt.append({"role": role, "content": content})

# Remove the prompt from the chosen and rejected dialogues
chosen = [{"role": "assistant", "content": chosen_line}]
rejected = [{"role": "assistant", "content": rejected_line}]

return {"prompt": prompt, "chosen": chosen, "rejected": rejected}


if __name__ == "__main__":
parser = HfArgumentParser(ScriptArguments)
args = parser.parse_args_into_dataclasses()[0]

dataset = load_dataset("Anthropic/hh-rlhf", data_dir="helpful-base")
dataset = dataset.map(extract_dialogue, num_proc=args.dataset_num_proc)

if args.push_to_hub:
dataset.push_to_hub(args.repo_id)
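A toy walk-through of the prompt extraction in the script above (hypothetical strings, not real dataset rows): the prompt is the character-by-character common prefix of the chosen and rejected dialogues, ending at the final `"\n\nAssistant: "` marker.

```python
chosen = "\n\nHuman: Name a color.\n\nAssistant: Blue."
rejected = "\n\nHuman: Name a color.\n\nAssistant: Red."

# Character-by-character common prefix, as in common_start() above.
prefix_chars = []
for c1, c2 in zip(chosen, rejected):
    if c1 != c2:
        break
    prefix_chars.append(c1)
prompt_text = "".join(prefix_chars)

# Here the two responses differ at their first character, so the prefix
# ends exactly at the generation prompt; the remainders are the responses.
chosen_line = chosen[len(prompt_text):]
rejected_line = rejected[len(prompt_text):]
```

When the responses share leading characters, the prefix overshoots the marker, which is exactly the case the `endswith("\n\nAssistant: ")` check in the script guards against.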
67 changes: 67 additions & 0 deletions examples/datasets/lm-human-preferences-descriptiveness.py
@@ -0,0 +1,67 @@
from dataclasses import dataclass
from typing import Optional

from datasets import load_dataset
from transformers import AutoTokenizer, HfArgumentParser


@dataclass
class ScriptArguments:
r"""
Arguments for the script.

Args:
push_to_hub (`bool`, *optional*, defaults to `False`):
Whether to push the dataset to the Hugging Face Hub.
repo_id (`str`, *optional*, defaults to `"trl-lib/lm-human-preferences-descriptiveness"`):
Hugging Face repository ID to push the dataset to.
dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`):
Number of workers to use for dataset processing.
"""

push_to_hub: bool = False
repo_id: str = "trl-lib/lm-human-preferences-descriptiveness"
dataset_num_proc: Optional[int] = None


# Edge cases handling: remove the cases where all samples are the same
def samples_not_all_same(example):
return not all(example["sample0"] == example[f"sample{j}"] for j in range(1, 4))


def to_prompt_completion(example, tokenizer):
prompt = tokenizer.decode(example["query"]).strip()
best_idx = example["best"]
chosen = tokenizer.decode(example[f"sample{best_idx}"])
for rejected_idx in range(4): # take the first rejected sample that is different from the chosen one
rejected = tokenizer.decode(example[f"sample{rejected_idx}"])
if chosen != rejected:
break
assert chosen != rejected
return {"prompt": prompt, "chosen": chosen, "rejected": rejected}


if __name__ == "__main__":
parser = HfArgumentParser(ScriptArguments)
args = parser.parse_args_into_dataclasses()[0]

dataset = load_dataset(
"json",
data_files="https://openaipublic.blob.core.windows.net/lm-human-preferences/labels/descriptiveness/offline_5k.json",
split="train",
)

dataset = dataset.filter(samples_not_all_same, num_proc=args.dataset_num_proc)

dataset = dataset.map(
to_prompt_completion,
num_proc=args.dataset_num_proc,
remove_columns=["query", "sample0", "sample1", "sample2", "sample3", "best"],
fn_kwargs={"tokenizer": AutoTokenizer.from_pretrained("gpt2")},
)

# train_size taken from https://github.com/openai/lm-human-preferences/blob/cbfd210bb8b08f6bc5c26878c10984b90f516c66/launch.py#L79
dataset = dataset.train_test_split(train_size=4992)

if args.push_to_hub:
dataset.push_to_hub(args.repo_id)
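The chosen/rejected selection in `to_prompt_completion` above can be sketched with plain strings standing in for decoded token sequences (toy data, no tokenizer needed): the human-preferred sample becomes `chosen`, and `rejected` is the first sample that differs from it, guarding against duplicated samples.

```python
# Toy example row: four candidate samples plus the index of the preferred one.
example = {"best": 2, "sample0": "a cat", "sample1": "a dog",
           "sample2": "a bird", "sample3": "a bird"}

# The human-preferred sample is the chosen completion...
chosen = example[f"sample{example['best']}"]

# ...and the rejected one is the first sample that differs from it
# (note sample3 duplicates the chosen text and would be skipped).
for rejected_idx in range(4):
    rejected = example[f"sample{rejected_idx}"]
    if rejected != chosen:
        break
```

The `assert chosen != rejected` in the script then catches the degenerate case where every sample matched the chosen one, which the `samples_not_all_same` filter should already have removed.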
60 changes: 60 additions & 0 deletions examples/datasets/lm-human-preferences-sentiment.py
@@ -0,0 +1,60 @@
from dataclasses import dataclass
from typing import Optional

from datasets import load_dataset
from transformers import AutoTokenizer, HfArgumentParser


@dataclass
class ScriptArguments:
r"""
Arguments for the script.

Args:
push_to_hub (`bool`, *optional*, defaults to `False`):
Whether to push the dataset to the Hugging Face Hub.
repo_id (`str`, *optional*, defaults to `"trl-lib/lm-human-preferences-sentiment"`):
Hugging Face repository ID to push the dataset to.
dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`):
Number of workers to use for dataset processing.
"""

push_to_hub: bool = False
repo_id: str = "trl-lib/lm-human-preferences-sentiment"
dataset_num_proc: Optional[int] = None


def to_prompt_completion(example, tokenizer):
prompt = tokenizer.decode(example["query"]).strip()
best_idx = example["best"]
chosen = tokenizer.decode(example[f"sample{best_idx}"])
for rejected_idx in range(4): # take the first rejected sample that is different from the chosen one
rejected = tokenizer.decode(example[f"sample{rejected_idx}"])
if chosen != rejected:
break
assert chosen != rejected
return {"prompt": prompt, "chosen": chosen, "rejected": rejected}


if __name__ == "__main__":
parser = HfArgumentParser(ScriptArguments)
args = parser.parse_args_into_dataclasses()[0]

dataset = load_dataset(
"json",
data_files="https://openaipublic.blob.core.windows.net/lm-human-preferences/labels/sentiment/offline_5k.json",
split="train",
)

dataset = dataset.map(
to_prompt_completion,
num_proc=args.dataset_num_proc,
remove_columns=["query", "sample0", "sample1", "sample2", "sample3", "best"],
fn_kwargs={"tokenizer": AutoTokenizer.from_pretrained("gpt2")},
)

# train_size taken from https://github.com/openai/lm-human-preferences/blob/cbfd210bb8b08f6bc5c26878c10984b90f516c66/launch.py#L70
dataset = dataset.train_test_split(train_size=4992)

if args.push_to_hub:
dataset.push_to_hub(args.repo_id)