from datasets import load_dataset, DatasetDict, concatenate_datasets

### Dataset 1
ds = load_dataset('kihoonlee/Korean-IC2024-processed', trust_remote_code=True)
train_ds = ds["train"]
val_ds = ds["dev"]
test_ds = ds["test"]
print(train_ds)
print(val_ds)
print(test_ds)

def transform_dataset1(dataset):
    def transform_example(example):
        return {
            'source': 'Korean-IC2024',
            'questions': "이미지를 설명해주세요.",  # "Please describe the image."
            'answers': example['output'],
            'images': example['image'],
        }
    # remove_columns drops the original columns so every split ends up with the
    # same schema, which concatenate_datasets requires below. (The original
    # batch_size=10000 was dropped: map ignores it unless batched=True, and
    # transform_example operates on single examples.)
    return dataset.map(transform_example, remove_columns=dataset.column_names)

a = transform_dataset1(train_ds)
b = transform_dataset1(val_ds)
c = transform_dataset1(test_ds)
print(a)
print(b)
print(c)

### Dataset 2
# Assumes this dataset already uses the target schema
# (source / questions / answers / images).
ds = load_dataset('kihoonlee/KoLLaVA-Instruct-313k', trust_remote_code=True)
# NOTE: the substring to strip was lost from the source. As written,
# replace('', '') is a no-op, so only .strip() has any effect here.
d = ds["train"].map(
    lambda example: {**example, 'questions': example['questions'].replace('', '').strip()}
)
print(d)

### Dataset 3
ds = load_dataset('kms7530/ko-coco-bal', trust_remote_code=True)
train_ds = ds["train"]
val_ds = ds["validation"]
print(train_ds)
print(val_ds)

def transform_dataset2(dataset):
    def transform_example(example):
        return {
            'source': 'kms7530/ko-coco-bal',
            'questions': "이미지를 설명해주세요.",  # "Please describe the image."
            'answers': example['caption_ko'],
            'images': example['image'],
        }
    return dataset.map(transform_example, remove_columns=dataset.column_names)

e = transform_dataset2(train_ds)
f = transform_dataset2(val_ds)
print(e)
print(f)

### Merge & Upload
all_data = concatenate_datasets([a, b, c, d, e, f])
all_dataset = DatasetDict({"train": all_data})
all_dataset.push_to_hub("kihoonlee/vlm_kotuning_dataset", private=True)
print("Dataset uploaded to Hugging Face Hub successfully.")
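
### Sanity check (optional)
# A minimal sketch, not part of the original script: reload the merged dataset
# from the Hub and confirm the row count and column schema survived the upload.
# The repo id matches the push_to_hub call above; since the repo is private,
# this assumes the same Hugging Face credentials are available locally.
check = load_dataset("kihoonlee/vlm_kotuning_dataset")
print(check)
# Every source was mapped to the same four columns before concatenation.
assert set(check["train"].column_names) == {"source", "questions", "answers", "images"}
# Row count should equal the sum of the six concatenated splits.
assert check["train"].num_rows == sum(split.num_rows for split in [a, b, c, d, e, f])
print(check["train"][0]["questions"])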