Commit 88698e92 authored by kihoon.lee's avatar kihoon.lee
Browse files

upload

parent 97677665
kms7530/koalphaca-orca-for-solar
CarrotAI/ko-instruction-dataset
HAERAE-HUB/Korean-Human-Judgements
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
from datasets import load_dataset, DatasetDict, Dataset
import json
import os
from huggingface_hub import HfApi, HfFolder
def add01():
    """Create the initial QA dataset from HAERAE-HUB/Korean-Human-Judgements.

    For every row of the ``train`` split, the instruction becomes the
    question and the answer is ``response_a`` when ``decision`` is ``"A"``
    or ``"Tie"``, otherwise ``response_b``. The row's own ``source`` value
    is carried over. The result is pushed to the Hub as the ``train`` split
    of ``kihoonlee/good-korean-dataset-QA``.
    """
    judgements = load_dataset("HAERAE-HUB/Korean-Human-Judgements")

    questions, answers, sources = [], [], []
    for row in judgements['train']:
        questions.append(row["instruction"])
        # A tie is resolved in favour of response A.
        chosen_key = "response_a" if row["decision"] in ("A", "Tie") else "response_b"
        answers.append(row[chosen_key])
        sources.append(row["source"])

    columns = {
        "questions": questions,
        "answers": answers,
        "source": sources,
    }
    hub_dataset = DatasetDict({"train": Dataset.from_dict(columns)})
    hub_dataset.push_to_hub("kihoonlee/good-korean-dataset-QA")
def add02(base_data_path='kihoonlee/good-korean-dataset-QA'):
    """Append CarrotAI/ko-instruction-dataset to the existing QA dataset.

    Loads the ``train`` split found at ``base_data_path``, appends one row
    per CarrotAI example (``instruction`` -> question, ``output`` ->
    answer), writes the merged rows to ``good-korean-dataset-QA.json`` in
    the current directory, and pushes the merged columns to
    ``kihoonlee/good-korean-dataset-QA``.

    Args:
        base_data_path: Dataset identifier of the existing QA dataset to
            extend.
    """
    print("run")
    carrot = load_dataset("CarrotAI/ko-instruction-dataset")
    columns = load_dataset(base_data_path)['train'].to_dict()

    # Row-wise copy of the existing data; used only for the JSON export.
    records = [
        {"questions": question, "answers": answer, "source": origin}
        for question, answer, origin in zip(
            columns['questions'], columns['answers'], columns['source']
        )
    ]

    origin_name = 'CarrotAI/ko-instruction-dataset'
    for example in carrot['train']:
        # Keep the row-wise export and the column-wise upload in step.
        records.append({
            "questions": example["instruction"],
            "answers": example["output"],
            "source": origin_name
        })
        columns["questions"].append(example["instruction"])
        columns["answers"].append(example["output"])
        columns["source"].append(origin_name)

    with open('good-korean-dataset-QA.json', 'w', encoding='utf-8') as out_file:
        json.dump(records, out_file, ensure_ascii=False, indent=4)

    merged = DatasetDict({"train": Dataset.from_dict(columns)})
    merged.push_to_hub("kihoonlee/good-korean-dataset-QA")
    print("done")
def _split_formatted_inst(formatted_inst):
    """Split a ``### User: ... ### Assistant: ...`` string into (question, answer).

    The text before the first ``### Assistant:`` marker is the question and
    the text between the first and second marker (or the end of the string)
    is the answer. Both are whitespace-trimmed, and a leading ``### User:``
    prefix is removed from the question.

    Raises:
        IndexError: If ``formatted_inst`` contains no ``### Assistant:`` marker.
    """
    user_marker = "### User:"
    segments = formatted_inst.split("### Assistant:")

    question = segments[0].strip()
    # Remove the literal prefix only. str.strip("### User:") would treat the
    # argument as a set of characters and also eat any leading/trailing
    # '#', ' ', 'U', 's', 'e', 'r' or ':' belonging to the question itself
    # (e.g. "User errors" -> "ors").
    if question.startswith(user_marker):
        question = question[len(user_marker):].strip()

    answer = segments[1].strip()
    return question, answer


def add03(base_data_path='kihoonlee/good-korean-dataset-QA'):
    """Append kms7530/koalphaca-orca-for-solar to the existing QA dataset.

    Loads the ``train`` split found at ``base_data_path``, parses every
    ``formated_inst`` field of the koalphaca dataset into a question/answer
    pair, writes the merged rows to ``good-korean-dataset-QA.json`` in the
    current directory, and pushes the merged columns to
    ``kihoonlee/good-korean-dataset-QA``.

    Args:
        base_data_path: Dataset identifier of the existing QA dataset to
            extend.

    Raises:
        IndexError: If a ``formated_inst`` value has no ``### Assistant:``
            marker.
    """
    print("run")
    dataset = load_dataset("kms7530/koalphaca-orca-for-solar")
    base_dataset = load_dataset(base_data_path)['train']
    new_dataset = base_dataset.to_dict()

    # Row-wise copy of the existing data; used only for the JSON export.
    combined_data = [
        {"questions": question, "answers": answer, "source": source}
        for question, answer, source in zip(
            new_dataset['questions'], new_dataset['answers'], new_dataset['source']
        )
    ]

    source_name = 'kms7530/koalphaca-orca-for-solar'
    for data in dataset['train']:
        # Parse once per row and reuse the result for both outputs.
        question, answer = _split_formatted_inst(data["formated_inst"])
        combined_data.append({
            "questions": question,
            "answers": answer,
            "source": source_name
        })
        new_dataset["questions"].append(question)
        new_dataset["answers"].append(answer)
        new_dataset["source"].append(source_name)

    with open('good-korean-dataset-QA.json', 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, ensure_ascii=False, indent=4)

    new_dataset_dict = DatasetDict({"train": Dataset.from_dict(new_dataset)})
    new_dataset_dict.push_to_hub("kihoonlee/good-korean-dataset-QA")
    print("done")
if __name__ == "__main__":
    # Run the stages in order: add01 creates the Hub dataset, then add02 and
    # add03 each load it back, append their own source and push it again.
    for stage in (add01, add02, add03):
        stage()
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment