# merge_and_upload.py
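"""Merge three Korean vision-language datasets (Korean-IC2024-processed,
KoLLaVA-Instruct-313k, ko-coco-bal) into a single dataset with a shared
(source, questions, answers, images) schema and push it to the Hugging Face Hub."""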
from datasets import load_dataset, DatasetDict, concatenate_datasets

### Dataset 1
ds = load_dataset('kihoonlee/Korean-IC2024-processed', trust_remote_code=True)
train_ds = ds["train"]
val_ds = ds["dev"]
test_ds = ds["test"]

print(train_ds)
print(val_ds)
print(test_ds)

def transform_dataset1(dataset):
    """Map Korean-IC2024 examples to the shared (source, questions, answers, images) schema."""
    def transform_example(example):
        return {
            'source': 'Korean-IC2024',
            'questions': "이미지를 설명해주세요.",  # "Please describe the image."
            'answers': example['output'],
            'images': example['image']
        }
    # Drop the original columns so every piece ends up with identical features;
    # concatenate_datasets below requires matching schemas. (batch_size only
    # takes effect with batched=True, so the no-op argument is dropped.)
    return dataset.map(transform_example, remove_columns=dataset.column_names)

a = transform_dataset1(train_ds)
b = transform_dataset1(val_ds)
c = transform_dataset1(test_ds)

print(a)
print(b)
print(c)

### Dataset 2
ds = load_dataset('kihoonlee/KoLLaVA-Instruct-313k', trust_remote_code=True)
# Strip the LLaVA-style '<image>' placeholder token from the question text.
d = ds["train"].map(
    lambda example: {**example, 'questions': example['questions'].replace('<image>', '').strip()}
)
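# Note: this assumes KoLLaVA-Instruct-313k already provides the
# source/questions/answers/images columns used above; if it carries extra
# columns, pass remove_columns to .map() as for the other two datasets.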

print(d)

### Dataset 3
ds = load_dataset('kms7530/ko-coco-bal', trust_remote_code=True)
train_ds = ds["train"]
val_ds = ds["validation"]
print(train_ds)
print(val_ds)

def transform_dataset2(dataset):
    """Map ko-coco-bal examples to the shared (source, questions, answers, images) schema."""
    def transform_example(example):
        return {
            'source': 'kms7530/ko-coco-bal',
            'questions': "이미지를 설명해주세요.",  # "Please describe the image."
            'answers': example['caption_ko'],
            'images': example['image']
        }
    # Drop the original columns for the same reason as in transform_dataset1.
    return dataset.map(transform_example, remove_columns=dataset.column_names)

e = transform_dataset2(train_ds)
f = transform_dataset2(val_ds)

print(e)
print(f)

### Merge & Upload
# All six pieces should now share identical features, so they can be
# concatenated into a single training split.
all_data = concatenate_datasets([a, b, c, d, e, f])
all_dataset = DatasetDict({"train": all_data})
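# Optional: inspect the merged schema before uploading.
print(all_data.features)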

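# push_to_hub requires Hub credentials (e.g. `huggingface-cli login` or the
# HF_TOKEN environment variable); private=True keeps the repo non-public.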
all_dataset.push_to_hub("kihoonlee/vlm_kotuning_dataset", private=True)

print("Dataset uploaded to Hugging Face Hub successfully.")