Commit 29a12d01 authored by kihoon.lee
Browse files

upload

parent c9bfd263
from datasets import load_dataset, DatasetDict, concatenate_datasets
### Dataset 1: Korean-IC2024
ds = load_dataset('kihoonlee/Korean-IC2024-processed', trust_remote_code=True)
# Pull out the three splits; the validation data is stored under the "dev" key.
train_ds, val_ds, test_ds = ds["train"], ds["dev"], ds["test"]
# One summary per line, in train / validation / test order.
print(train_ds, val_ds, test_ds, sep="\n")
def transform_dataset1(dataset):
    """Convert a Korean-IC2024 split to the shared question/answer schema.

    Every example is mapped to the columns ``source``, ``questions``,
    ``answers`` and ``images``. The split's original columns are dropped so
    the result carries only those four columns.

    Args:
        dataset: A split exposing ``column_names`` and ``map`` (for example a
            ``datasets.Dataset``) whose examples have ``output`` and
            ``image`` fields.

    Returns:
        Whatever ``dataset.map`` returns for the per-example conversion.
    """

    def transform_example(example):
        return {
            'source': 'Korean-IC2024',
            'questions': "이미지를 설명해주세요.",
            'answers': example['output'],
            'images': example['image'],
        }

    # The function handles one example at a time (not batched), so no
    # batch_size is passed. remove_columns drops the source columns, which
    # would otherwise be kept next to the new ones (the image would then be
    # stored under both 'image' and 'images').
    return dataset.map(transform_example, remove_columns=dataset.column_names)
# Convert the train / validation / test splits of dataset 1, in that order,
# then show one summary per line.
a, b, c = (transform_dataset1(split) for split in (train_ds, val_ds, test_ds))
print(a, b, c, sep="\n")
### Dataset 2: KoLLaVA-Instruct-313k
ds = load_dataset('kihoonlee/KoLLaVA-Instruct-313k', trust_remote_code=True)
# Strip the '<image>' placeholder from each question. Only the changed column
# is returned: map() merges the returned dict into the example, so the other
# columns pass through without being read and rebuilt by this function.
# batch_size is omitted because it only applies when batched=True.
d = ds["train"].map(
    lambda example: {'questions': example['questions'].replace('<image>', '').strip()}
)
print(d)
### Dataset 3: kms7530/ko-coco-bal
ds = load_dataset('kms7530/ko-coco-bal', trust_remote_code=True)
# This dataset is read through its "train" and "validation" splits only.
train_ds, val_ds = ds["train"], ds["validation"]
# One summary per line, train first.
print(train_ds, val_ds, sep="\n")
def transform_dataset2(dataset):
    """Convert a kms7530/ko-coco-bal split to the shared question/answer schema.

    Every example is mapped to the columns ``source``, ``questions``,
    ``answers`` and ``images``. The split's original columns are dropped so
    the result carries only those four columns.

    Args:
        dataset: A split exposing ``column_names`` and ``map`` (for example a
            ``datasets.Dataset``) whose examples have ``caption_ko`` and
            ``image`` fields.

    Returns:
        Whatever ``dataset.map`` returns for the per-example conversion.
    """

    def transform_example(example):
        return {
            'source': 'kms7530/ko-coco-bal',
            'questions': "이미지를 설명해주세요.",
            'answers': example['caption_ko'],
            'images': example['image'],
        }

    # The function handles one example at a time (not batched), so no
    # batch_size is passed. remove_columns drops the source columns, which
    # would otherwise be kept next to the new ones (the image would then be
    # stored under both 'image' and 'images').
    return dataset.map(transform_example, remove_columns=dataset.column_names)
# Convert the train and validation splits of dataset 3, in that order,
# then show one summary per line.
e, f = (transform_dataset2(split) for split in (train_ds, val_ds))
print(e, f, sep="\n")
### Merge & Upload
# Stack every prepared dataset (a-f) into a single dataset, in that order.
all_data = concatenate_datasets([a, b, c, d, e, f])
# The merged data is published as the only split, named "train".
all_dataset = DatasetDict({"train": all_data})
all_dataset.push_to_hub(repo_id="kihoonlee/vlm_kotuning_dataset", private=True)
# Reached only if push_to_hub returned without raising.
print("Dataset uploaded to Hugging Face Hub successfully.")
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment