Data
In [ ]:
Copied!
import pandas as pd
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset
In [ ]:
Copied!
# Load the dataset identified by this repository id via `load_dataset`.
dataset_id = "Fredithefish/Instruction-Tuning-with-GPT-4-RedPajama-Chat"
data = load_dataset(dataset_id)
# Bare expression: shows the loaded object when run as a notebook cell.
data
In [ ]:
Copied!
# Convert the "train" split of the loaded dataset into a pandas DataFrame.
train_split = data["train"]
df = train_split.to_pandas()
# Bare expression: shows the frame when run as a notebook cell.
df
In [ ]:
Copied!
# Derive the "Assistant" and "Human" columns from the raw "text" column.
#
# NOTE(review): assumes each "text" value holds a "<human>:" turn followed by
# a "<bot>:" turn -- confirm against the dataset card.
#
# The split must be done on the literal "<bot>:" / "<human>:" markers.
# Splitting on a bare ":" (as a tag-stripped copy of this cell did) cuts the
# text at the first/last colon anywhere in the record and overwrites both
# columns with wrong values.


def _assistant_reply(text):
    """Return everything after the last "<bot>:" marker in *text*."""
    return text.split("<bot>:")[-1]


def _human_prompt(text):
    """Return the text before the first "<bot>:" marker, formatted as a prompt.

    The "<human>:" marker and all newlines are removed, then the result is
    wrapped as "Human:<question>. Assistant: ".
    """
    question = text.split("<bot>:")[0].replace("<human>:", "").replace("\n", "")
    return "Human:" + question + ". Assistant: "


df["Assistant"] = df["text"].apply(_assistant_reply)
df["Human"] = df["text"].apply(_human_prompt)
# Bare expression: shows the frame when run as a notebook cell.
df
In [ ]:
Copied!
# Draw 20,000 rows at random (without replacement, in shuffled order).
# A fixed seed makes re-running this cell yield the same subset and order;
# sampling once is enough -- a second draw would only reshuffle the same rows.
df = df.sample(n=20000, random_state=42)
# Bare expression: shows the frame when run as a notebook cell.
df
In [ ]:
Copied!
# Restrict the frame to the "Human" and "Assistant" columns, in that order.
kept_columns = ["Human", "Assistant"]
df = df[kept_columns]
In [ ]:
Copied!
# Look at the "Human" value stored in the first row (by position).
first_row = df.iloc[0]
# Bare expression: shows the string when run as a notebook cell.
first_row["Human"]
In [ ]:
Copied!
# Write the first 15,000 rows to train.parquet and the remaining rows to
# test.parquet, leaving the DataFrame index out of both files.
split_at = 15000
train_rows = df[:split_at]
test_rows = df[split_at:]
train_rows.to_parquet("train.parquet", index=False)
test_rows.to_parquet("test.parquet", index=False)
In [ ]:
Copied!
# Write the whole frame (all rows) to a single Parquet file, without the index.
output_path = "chat_data.parquet"
df.to_parquet(output_path, index=False)
In [ ]:
Copied!