Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
kihoon.lee
dataset
Commits
29a12d01
Commit
29a12d01
authored
Aug 22, 2024
by
kihoon.lee
Browse files
upload
parent
c9bfd263
Changes
1
Hide whitespace changes
Inline
Side-by-side
vlm/merge_and_upload.py
0 → 100644
View file @
29a12d01
from
datasets
import
load_dataset
,
DatasetDict
,
concatenate_datasets
# --- Dataset 1: Korean-IC2024 (processed) ---
# Load the dataset by its Hub id, passing trust_remote_code=True through.
ds = load_dataset("kihoonlee/Korean-IC2024-processed", trust_remote_code=True)

# The validation data is read from the split named "dev".
train_ds, val_ds, test_ds = ds["train"], ds["dev"], ds["test"]

# Print each split's summary, one per line, as a quick sanity check.
print(train_ds, val_ds, test_ds, sep="\n")
def transform_dataset1(dataset):
    """Convert a Korean-IC2024 split to the shared question/answer schema.

    Every row is rewritten to these four keys:

    * ``source``    -- the constant tag ``'Korean-IC2024'``
    * ``questions`` -- a fixed Korean prompt ("Please describe the image.")
    * ``answers``   -- the row's ``output`` value
    * ``images``    -- the row's ``image`` value

    Args:
        dataset: Object exposing ``column_names`` and a ``map`` method that
            accepts a per-row function and a ``remove_columns`` keyword
            (e.g. a ``datasets.Dataset``).

    Returns:
        Whatever ``dataset.map`` returns for the per-row conversion.
    """

    def transform_example(example):
        # Build the unified row from the two source fields that are used.
        return {
            'source': 'Korean-IC2024',
            'questions': "이미지를 설명해주세요.",
            'answers': example['output'],
            'images': example['image'],
        }

    # Drop the input columns so the result carries only the unified keys;
    # otherwise 'image' and 'output' would be kept alongside their copies.
    # batch_size is not passed: map() only uses it together with batched=True.
    return dataset.map(transform_example, remove_columns=dataset.column_names)
# Convert the dataset-1 splits in the order train, dev, test.
a, b, c = (transform_dataset1(part) for part in (train_ds, val_ds, test_ds))

# Print the converted splits, one summary per line.
print(a, b, c, sep="\n")
# --- Dataset 2: KoLLaVA-Instruct-313k ---
# Load the dataset by its Hub id, passing trust_remote_code=True through.
ds = load_dataset('kihoonlee/KoLLaVA-Instruct-313k', trust_remote_code=True)

# Remove the literal '<image>' placeholder from each question and trim the
# surrounding whitespace. Only the changed column is returned: map() merges
# the returned dict into the row, so the other columns do not need to be
# re-emitted. batch_size is not passed because map() only uses it together
# with batched=True.
d = ds["train"].map(
    lambda example: {
        'questions': example['questions'].replace('<image>', '').strip(),
    }
)
print(d)
# --- Dataset 3: ko-coco-bal ---
# Load the dataset by its Hub id, passing trust_remote_code=True through.
ds = load_dataset("kms7530/ko-coco-bal", trust_remote_code=True)

# Only the "train" and "validation" splits are used from this dataset.
train_ds, val_ds = ds["train"], ds["validation"]

# Print each split's summary, one per line, as a quick sanity check.
print(train_ds, val_ds, sep="\n")
def transform_dataset2(dataset):
    """Convert a ko-coco-bal split to the shared question/answer schema.

    Every row is rewritten to these four keys:

    * ``source``    -- the constant tag ``'kms7530/ko-coco-bal'``
    * ``questions`` -- a fixed Korean prompt ("Please describe the image.")
    * ``answers``   -- the row's ``caption_ko`` value
    * ``images``    -- the row's ``image`` value

    Args:
        dataset: Object exposing ``column_names`` and a ``map`` method that
            accepts a per-row function and a ``remove_columns`` keyword
            (e.g. a ``datasets.Dataset``).

    Returns:
        Whatever ``dataset.map`` returns for the per-row conversion.
    """

    def transform_example(example):
        # Build the unified row from the two source fields that are used.
        return {
            'source': 'kms7530/ko-coco-bal',
            'questions': "이미지를 설명해주세요.",
            'answers': example['caption_ko'],
            'images': example['image'],
        }

    # Drop the input columns so the result carries only the unified keys;
    # otherwise 'image' and 'caption_ko' would be kept alongside their copies.
    # batch_size is not passed: map() only uses it together with batched=True.
    return dataset.map(transform_example, remove_columns=dataset.column_names)
# Convert the dataset-3 splits in the order train, validation.
e, f = (transform_dataset2(part) for part in (train_ds, val_ds))

# Print the converted splits, one summary per line.
print(e, f, sep="\n")
# --- Merge & upload ---
# Concatenate the six converted pieces, in order a..f, into one dataset.
all_data = concatenate_datasets([a, b, c, d, e, f])

# Wrap the merged rows as a single "train" split and push it to the Hub
# repository below with private=True.
all_dataset = DatasetDict({"train": all_data})
all_dataset.push_to_hub(repo_id="kihoonlee/vlm_kotuning_dataset", private=True)

print("Dataset uploaded to Hugging Face Hub successfully.")
\ No newline at end of file
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment