Open
Description
On Windows, the following error occurs when extracting the Wikipedia corpus:
INFO: Starting page extraction from zhwiki-20240301-pages-articles-multistream.xml.bz2.
Traceback (most recent call last):
File "D:\ProgramData\anaconda3\envs\myenv\lib\runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "D:\ProgramData\anaconda3\envs\myenv\lib\runpy.py", line 86, in _run_code
exec(code, run_globals)
File "D:\ProgramData\anaconda3\envs\myenv\lib\site-packages\wikiextractor\WikiExtractor.py", line 643, in <module>
main()
File "D:\ProgramData\anaconda3\envs\myenv\lib\site-packages\wikiextractor\WikiExtractor.py", line 639, in main
process_dump(input_file, args.templates, output_path, file_size,
File "D:\ProgramData\anaconda3\envs\myenv\lib\site-packages\wikiextractor\WikiExtractor.py", line 417, in process_dump
Process = get_context("fork").Process
File "D:\ProgramData\anaconda3\envs\myenv\lib\multiprocessing\context.py", line 243, in get_context
return super().get_context(method)
File "D:\ProgramData\anaconda3\envs\myenv\lib\multiprocessing\context.py", line 193, in get_context
raise ValueError('cannot find context for %r' % method) from None
ValueError: cannot find context for 'fork'
Edit: I changed line 417 of wikiextractor\WikiExtractor.py from
Process = get_context("fork").Process
to
Process = get_context("spawn").Process
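With that change, a new error occurs, because the "spawn" start method has to pickle the Process object in order to send it to the child process: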
INFO: Starting page extraction from zhwiki-20240301-pages-articles-multistream.xml.bz2.
Traceback (most recent call last):
File "D:\ProgramData\anaconda3\envs\myenv\lib\runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "D:\ProgramData\anaconda3\envs\myenv\lib\runpy.py", line 86, in _run_code
exec(code, run_globals)
File "D:\ProgramData\anaconda3\envs\myenv\Scripts\wikiextractor.exe\__main__.py", line 7, in <module>
File "D:\ProgramData\anaconda3\envs\myenv\lib\site-packages\wikiextractor\WikiExtractor.py", line 639, in main
process_dump(input_file, args.templates, output_path, file_size,
File "D:\ProgramData\anaconda3\envs\myenv\lib\site-packages\wikiextractor\WikiExtractor.py", line 425, in process_dump
reduce.start()
File "D:\ProgramData\anaconda3\envs\myenv\lib\multiprocessing\process.py", line 121, in start
self._popen = self._Popen(self)
File "D:\ProgramData\anaconda3\envs\myenv\lib\multiprocessing\context.py", line 336, in _Popen
return Popen(process_obj)
File "D:\ProgramData\anaconda3\envs\myenv\lib\multiprocessing\popen_spawn_win32.py", line 93, in __init__
reduction.dump(process_obj, to_child)
File "D:\ProgramData\anaconda3\envs\myenv\lib\multiprocessing\reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
TypeError: cannot pickle '_io.TextIOWrapper' object
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "D:\ProgramData\anaconda3\envs\myenv\lib\multiprocessing\spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "D:\ProgramData\anaconda3\envs\myenv\lib\multiprocessing\spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
EOFError: Ran out of input
Metadata
Metadata
Assignees
Labels
No labels