Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
kihoon.lee
dataset
Commits
88698e92
Commit
88698e92
authored
Aug 14, 2024
by
kihoon.lee
Browse files
upload
parent
97677665
Changes
3
Hide whitespace changes
Inline
Side-by-side
good-korean-dataset-QA/README.md
0 → 100644
View file @
88698e92
kms7530/koalphaca-orca-for-solar
CarrotAI/ko-instruction-dataset
HAERAE-HUB/Korean-Human-Judgements
\ No newline at end of file
good-korean-dataset-QA/good-korean-dataset-QA.json
0 → 100644
View file @
88698e92
This source diff could not be displayed because it is too large. You can
view the blob
instead.
good-korean-dataset-QA/making-and-push.py
0 → 100644
View file @
88698e92
from
datasets
import
load_dataset
,
DatasetDict
,
Dataset
import
json
import
os
from
huggingface_hub
import
HfApi
,
HfFolder
def add01():
    """Create the initial QA dataset from HAERAE-HUB/Korean-Human-Judgements.

    For every row of the ``train`` split the instruction becomes the
    question, and the answer is ``response_a`` when the recorded decision is
    ``"A"`` or ``"Tie"``, otherwise ``response_b``. The row's own ``source``
    value is carried over unchanged. The resulting three-column dataset is
    pushed to the hub repository ``kihoonlee/good-korean-dataset-QA``.
    """
    judgements = load_dataset("HAERAE-HUB/Korean-Human-Judgements")

    questions, answers, sources = [], [], []
    for row in judgements['train']:
        questions.append(row["instruction"])
        # Ties are resolved in favour of response A; any other decision
        # value falls through to response B.
        preferred = "response_a" if row["decision"] in ("A", "Tie") else "response_b"
        answers.append(row[preferred])
        sources.append(row["source"])

    columns = {"questions": questions, "answers": answers, "source": sources}
    hub_dataset = DatasetDict({"train": Dataset.from_dict(columns)})
    hub_dataset.push_to_hub("kihoonlee/good-korean-dataset-QA")
def add02(base_data_path='kihoonlee/good-korean-dataset-QA'):
    """Append CarrotAI/ko-instruction-dataset to the existing QA dataset.

    Loads the ``train`` split found at ``base_data_path``, appends one record
    per row of the CarrotAI ``train`` split (``instruction`` -> question,
    ``output`` -> answer, fixed source label), writes every record to
    ``good-korean-dataset-QA.json`` in the current working directory, and
    pushes the merged columns to ``kihoonlee/good-korean-dataset-QA``.

    Args:
        base_data_path: Dataset identifier or path handed to ``load_dataset``
            to obtain the rows collected so far.
    """
    print("run")
    carrot = load_dataset("CarrotAI/ko-instruction-dataset")
    columns = load_dataset(base_data_path)['train'].to_dict()

    # Row-oriented view of the already-collected data, used for the JSON dump.
    records = [
        {"questions": question, "answers": answer, "source": source}
        for question, answer, source in zip(
            columns['questions'], columns['answers'], columns['source']
        )
    ]

    for row in carrot['train']:
        record = {
            "questions": row["instruction"],
            "answers": row["output"],
            "source": 'CarrotAI/ko-instruction-dataset',
        }
        records.append(record)
        # Keep the column-oriented dict in step with the row-oriented list.
        for column_name, value in record.items():
            columns[column_name].append(value)

    with open('good-korean-dataset-QA.json', 'w', encoding='utf-8') as json_file:
        json.dump(records, json_file, ensure_ascii=False, indent=4)

    merged = DatasetDict({"train": Dataset.from_dict(columns)})
    merged.push_to_hub("kihoonlee/good-korean-dataset-QA")
    print("done")
def add03(base_data_path='kihoonlee/good-korean-dataset-QA'):
    """Append kms7530/koalphaca-orca-for-solar to the existing QA dataset.

    Loads the ``train`` split found at ``base_data_path``, appends one record
    per row of the koalphaca ``train`` split (question and answer are parsed
    out of the row's ``formated_inst`` text), writes every record to
    ``good-korean-dataset-QA.json`` in the current working directory, and
    pushes the merged columns to ``kihoonlee/good-korean-dataset-QA``.

    Args:
        base_data_path: Dataset identifier or path handed to ``load_dataset``
            to obtain the rows collected so far.

    Raises:
        IndexError: If a row's ``formated_inst`` has no "### Assistant:"
            marker.
    """
    print("run")
    dataset = load_dataset("kms7530/koalphaca-orca-for-solar")
    new_dataset = load_dataset(base_data_path)['train'].to_dict()

    # Row-oriented view of the already-collected data, used for the JSON dump.
    combined_data = [
        {"questions": question, "answers": answer, "source": source}
        for question, answer, source in zip(
            new_dataset['questions'], new_dataset['answers'], new_dataset['source']
        )
    ]

    for data in dataset['train']:
        question, answer = _split_user_assistant(data["formated_inst"])
        record = {
            "questions": question,
            "answers": answer,
            "source": 'kms7530/koalphaca-orca-for-solar',
        }
        combined_data.append(record)
        # Keep the column-oriented dict in step with the row-oriented list.
        for column_name, value in record.items():
            new_dataset[column_name].append(value)

    with open('good-korean-dataset-QA.json', 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, ensure_ascii=False, indent=4)

    new_dataset_dict = DatasetDict({"train": Dataset.from_dict(new_dataset)})
    new_dataset_dict.push_to_hub("kihoonlee/good-korean-dataset-QA")
    print("done")


def _split_user_assistant(formatted_inst):
    """Split a "### User: ... ### Assistant: ..." string into (question, answer).

    The question is the text before the first "### Assistant:" marker with a
    leading "### User:" header removed; the answer is the text between the
    first and (if any) second "### Assistant:" marker. Both are stripped of
    surrounding whitespace.

    Raises:
        IndexError: If ``formatted_inst`` contains no "### Assistant:" marker.
    """
    user_header = "### User:"
    turns = formatted_inst.split("### Assistant:")

    question = turns[0].strip()
    # Remove the header as a prefix. str.strip("### User:") would treat the
    # argument as a character set and also eat matching letters ('U', 's',
    # 'e', 'r', '#', ':') from both ends of the question itself.
    if question.startswith(user_header):
        question = question[len(user_header):].strip()

    answer = turns[1].strip()
    return question, answer
if __name__ == "__main__":
    # Run the build steps in order: add01 pushes the initial dataset, and
    # add02/add03 by default load that same hub repository before appending.
    for build_step in (add01, add02, add03):
        build_step()
\ No newline at end of file
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment