Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
kihoon.lee
dataset
Commits
b047b412
Commit
b047b412
authored
Aug 08, 2024
by
kihoon.lee
Browse files
update
parent
1806ca21
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
question_recommendation/QR_v1.5.json
View file @
b047b412
This diff is collapsed.
Click to expand it.
question_recommendation/QR_v1.5_1.json
0 → 100644
View file @
b047b412
This diff is collapsed.
Click to expand it.
question_recommendation/formatting.py
0 → 100644
View file @
b047b412
import
json
def clean_recommendations(file_path: str, new_file_path: str) -> None:
    """Renumber recommended questions and drop any text before their number.

    Loads a JSON array from ``file_path``. For every entry whose
    ``'추천질의'`` value is non-empty, the value is split on newlines and each
    non-blank line is rewritten as ``"<n>. <question>"``, where ``<n>`` is the
    1-based position of the line. Any existing ``"N."`` list marker, together
    with chatty preamble in front of it (e.g. ``"물론입니다. 1. ..."``), is
    removed before the new number is applied. Lines without a list marker are
    kept whole and simply receive a number.

    The full data set (modified or not) is written to ``new_file_path`` as
    UTF-8 JSON, and a summary ``<modified entries>/<total entries>`` is
    printed.

    Args:
        file_path: Path of the JSON file to read.
        new_file_path: Path of the JSON file to write.
    """
    import re  # function-scope import keeps this block self-contained

    # Optional preamble ending in whitespace, followed by an "N." list marker.
    # The negative lookahead keeps decimals such as "3.10" from being mistaken
    # for a list marker.
    marker = re.compile(r'(?:^|.*?\s)\d+\.(?!\d)\s*')

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    count = 0
    modified_count = 0
    for entry in data:
        count += 1
        if '추천질의' not in entry or not entry['추천질의']:
            continue

        original_value = entry['추천질의']
        updated_questions = []
        for line in original_value.split('\n'):
            # Blank lines are not questions; numbering them would shift every
            # following index.
            if not line.strip():
                continue
            found = marker.match(line)
            text = line[found.end():] if found else line
            updated_questions.append(f"{len(updated_questions) + 1}. {text}")

        new_value = '\n'.join(updated_questions)
        # Count per entry so the numerator and denominator of the summary
        # share the same unit.
        if new_value != original_value:
            modified_count += 1
        entry['추천질의'] = new_value

    with open(new_file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

    print(f"\033[94mclean_recommendations 완료. 포매팅된 개수: {modified_count}/{count}\033[0m")
def trim_long_recommendations(file_path: str, new_file_path: str,
                              max_questions: int = 3) -> None:
    """Cut each entry's recommended questions down to ``max_questions``.

    Loads a JSON array from ``file_path``. For every entry whose
    ``'추천질의'`` value is non-empty, the value is split on newlines and
    lines are kept in order until a line past position ``max_questions``
    contains ``'. '``; that line and everything after it is dropped. Lines
    past the limit that do not contain ``'. '`` are kept.

    The full data set is written to ``new_file_path`` as UTF-8 JSON.
    ``file_path`` and ``new_file_path`` may be the same path, because the
    input is read completely before the output is opened. A summary
    ``<modified entries>/<total entries>`` is printed.

    Args:
        file_path: Path of the JSON file to read.
        new_file_path: Path of the JSON file to write.
        max_questions: Number of leading lines that are always kept.
            Defaults to 3, the previously hard-coded limit.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    count = 0
    modified_count = 0
    for entry in data:
        count += 1
        if '추천질의' not in entry or not entry['추천질의']:
            continue

        questions = entry['추천질의'].split('\n')
        kept = []
        for position, question in enumerate(questions, start=1):
            # '. ' is the marker this file uses to recognise a numbered item.
            if position > max_questions and '. ' in question:
                break
            kept.append(question)

        if len(kept) != len(questions):
            modified_count += 1
            entry['추천질의'] = '\n'.join(kept)

    with open(new_file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

    print(f"\033[94mtrim_long_recommendations 완료. 변경된 데이터 개수: {modified_count}/{count}\033[0m")
if __name__ == "__main__":
    # Two-stage pipeline: the first stage reads the raw data set and writes a
    # renumbered copy; the second stage trims that copy in place (the same
    # path is passed as both input and output).
    raw_dataset = '/dataset/question_recommendation/QR_v1.5.json'
    cleaned_dataset = '/dataset/question_recommendation/QR_v1.5_1.json'

    clean_recommendations(raw_dataset, cleaned_dataset)
    trim_long_recommendations(cleaned_dataset, cleaned_dataset)

    print("\033[94m\nDONE!\033[0m")
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment