import json
import os

from datasets import load_dataset, DatasetDict, Dataset
from huggingface_hub import HfApi, HfFolder

# Hub repository every step pushes its result to.
TARGET_REPO = "kihoonlee/good-korean-dataset-QA"
# Local JSON snapshot written by add02/add03 before pushing.
JSON_DUMP_PATH = "good-korean-dataset-QA.json"

# Markers delimiting the two turns inside a "formated_inst" prompt string.
_USER_MARKER = "### User:"
_ASSISTANT_MARKER = "### Assistant:"


def _split_solar_prompt(formatted):
    """Split a '### User: ... ### Assistant: ...' prompt into (question, answer).

    The question is the text before the first assistant marker with a leading
    user marker removed; the answer is the text between the first and the
    second assistant marker (or the end of the string). Both are
    whitespace-stripped.

    Raises:
        IndexError: if the assistant marker does not occur in ``formatted``.
    """
    parts = formatted.split(_ASSISTANT_MARKER)
    question = parts[0].strip()
    # Remove the literal prefix only. str.strip("### User:") would treat the
    # argument as a set of characters and could also eat leading/trailing
    # letters of the question itself (e.g. a trailing "r" or "s").
    if question.startswith(_USER_MARKER):
        question = question[len(_USER_MARKER):]
    return question.strip(), parts[1].strip()


def _extend_and_publish(base_data_path, qa_pairs, source_name):
    """Append QA pairs to the base dataset, dump it to JSON and push it.

    Args:
        base_data_path: dataset id passed to ``load_dataset``; its 'train'
            split must provide 'questions', 'answers' and 'source' columns.
        qa_pairs: iterable of ``(question, answer)`` tuples to append.
        source_name: value stored in the 'source' column for every new row.
    """
    columns = load_dataset(base_data_path)["train"].to_dict()

    for question, answer in qa_pairs:
        columns["questions"].append(question)
        columns["answers"].append(answer)
        columns["source"].append(source_name)

    # Row-oriented view of the merged columns for the JSON snapshot.
    records = [
        {"questions": question, "answers": answer, "source": source}
        for question, answer, source in zip(
            columns["questions"], columns["answers"], columns["source"]
        )
    ]
    with open(JSON_DUMP_PATH, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=4)

    # NOTE: the push target is fixed and does not follow base_data_path.
    DatasetDict({"train": Dataset.from_dict(columns)}).push_to_hub(TARGET_REPO)


def add01():
    """Create the initial QA dataset from HAERAE-HUB/Korean-Human-Judgements.

    For each row of the 'train' split, the instruction becomes the question
    and the answer is ``response_a`` when the decision is "A" or "Tie",
    otherwise ``response_b``. The result is pushed to ``TARGET_REPO``.
    """
    dataset = load_dataset("HAERAE-HUB/Korean-Human-Judgements")

    columns = {"questions": [], "answers": [], "source": []}
    for row in dataset["train"]:
        chosen = "response_a" if row["decision"] in ("A", "Tie") else "response_b"
        columns["questions"].append(row["instruction"])
        columns["answers"].append(row[chosen])
        columns["source"].append(row["source"])

    DatasetDict({"train": Dataset.from_dict(columns)}).push_to_hub(TARGET_REPO)


def add02(base_data_path='kihoonlee/good-korean-dataset-QA'):
    """Append CarrotAI/ko-instruction-dataset to the base dataset and publish.

    Args:
        base_data_path: dataset id of the existing QA dataset to extend.
    """
    print("run")
    source_name = "CarrotAI/ko-instruction-dataset"
    dataset = load_dataset(source_name)

    _extend_and_publish(
        base_data_path,
        ((row["instruction"], row["output"]) for row in dataset["train"]),
        source_name,
    )
    print("done")


def add03(base_data_path='kihoonlee/good-korean-dataset-QA'):
    """Append kms7530/koalphaca-orca-for-solar to the base dataset and publish.

    Each row's ``formated_inst`` string is split into a question and an
    answer around the '### Assistant:' marker.

    Args:
        base_data_path: dataset id of the existing QA dataset to extend.

    Raises:
        IndexError: if a row's ``formated_inst`` lacks the assistant marker.
    """
    print("run")
    source_name = "kms7530/koalphaca-orca-for-solar"
    dataset = load_dataset(source_name)

    _extend_and_publish(
        base_data_path,
        (_split_solar_prompt(row["formated_inst"]) for row in dataset["train"]),
        source_name,
    )
    print("done")


if __name__ == "__main__":
    add01()
    add02()
    add03()