Add scripts to convert raw file to text-only json by research4pan · Pull Request #638 · OptimalScale/LMFlow · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Add scripts to convert raw file to text-only json #638

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 6, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions scripts/data_preprocess/raw2textonly.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
"""
Converts a raw text file, separated by lines, into a "text-only" formatted json.
"""
from __future__ import absolute_import

import argparse
import json
import textwrap
import sys

def parse_argument(sys_argv):
    """Parses arguments from command line.

    Args:
        sys_argv: the list of arguments (strings) from command line,
            including the program name as its first element (which is
            skipped before parsing).
    Returns:
        A namespace whose members correspond to the optional arguments
        `dataset_path` and `output_path`; each is `None` when omitted.
        For example,
        ```
        args = parse_argument(['main.py', '--dataset_path', 'a.txt'])
        args.dataset_path    # 'a.txt'
        args.output_path     # None
        ```
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # Input / output locations; both fall back to the standard streams.
    parser.add_argument(
        "--dataset_path",
        type=str,
        default=None,
        help=textwrap.dedent("input dataset path, reads from stdin by default"),
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default=None,
        help=textwrap.dedent("output dataset path, writes to stdout by default"),
    )

    # The first element is the program name, which argparse must not see.
    return parser.parse_args(sys_argv[1:])


def raw2textonly(fin):
    """
    Builds a "text-only" dataset dict from line-separated raw text.

    Args:
        fin: an iterable yielding lines of text, e.g. an opened text file
            or `sys.stdin`.
    Returns:
        a dict of the form
        `{"type": "text_only", "instances": [{"text": ...}, ...]}` holding
        one instance per input line, with leading and trailing whitespace
        stripped from each line.
    """
    instances = []
    for raw_line in fin:
        # Blank lines are kept and become instances with empty text.
        instances.append({"text": raw_line.strip()})

    return {"type": "text_only", "instances": instances}


def main():
    """Command-line entry point: converts raw text lines to text-only json.

    Reads from `--dataset_path` (or stdin when omitted), converts the lines
    with `raw2textonly`, and writes the resulting json to `--output_path`
    (or stdout when omitted).
    """
    args = parse_argument(sys.argv)

    if args.dataset_path is not None:
        # Explicit UTF-8 so the result does not depend on the platform's
        # locale default encoding.
        with open(args.dataset_path, "r", encoding="utf-8") as fin:
            data_dict = raw2textonly(fin)
    else:
        data_dict = raw2textonly(sys.stdin)

    if args.output_path is not None:
        # ensure_ascii=False emits non-ASCII characters verbatim, so the
        # file must be opened as UTF-8 or writing may fail on locales
        # whose default encoding cannot represent them.
        with open(args.output_path, "w", encoding="utf-8") as fout:
            json.dump(data_dict, fout, indent=4, ensure_ascii=False)
    else:
        json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False)


# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()
0