You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
/**
 * Parses a plain-text data source and emits its content as document tags.
 *
 * Phase 1 - encoding: the raw bytes are passed to the charset detector.
 *   - If the detector cannot be created, an error tag is sent and UTF-8 is assumed.
 *   - If no encoding is detected, ASCII is assumed and the content is replaced by the
 *     result of sequences_of_printable_characters() (the input may be binary).
 *   - Any encoding other than "utf-8"/"UTF-8" is converted to UTF-8. If the converter
 *     cannot be created, an error tag is sent and the unconverted content is used.
 *   - A std::exception raised in this phase is rethrown nested inside
 *     make_error("Error converting text to UTF-8").
 *
 * Phase 2 - tag emission: Document ... CloseDocument, where the body depends on the
 * parse_paragraphs / parse_lines settings:
 *   - neither set: the whole text is sent as a single Text tag;
 *   - parse_paragraphs: an empty line closes the current paragraph; lines inside a
 *     paragraph are separated by BreakLine (parse_lines) or by the original EOL text;
 *   - parse_lines only: every EOL becomes a BreakLine tag.
 *
 * @param data source whose content is read with data.string().
 */
void TXTParser::parse(const data_source& data)
{
    docwire_log(debug) << "Using TXT parser.";
    std::string text;
    csd_t charset_detector = NULL;
    htmlcxx::CharsetConverter* converter = NULL;
    try
    {
        std::string encoding;
        std::string content = data.string();
        charset_detector = csd_open();
        if (charset_detector == (csd_t)-1)
        {
            // Detector unavailable: report it and fall back to UTF-8 (no conversion).
            charset_detector = NULL;
            sendTag(make_error_ptr("Could not create charset detector"));
            encoding = "UTF-8";
        }
        else
        {
            csd_consider(charset_detector, content.c_str(), content.length());
            const char* res = csd_close(charset_detector);
            // The handle was just closed; clear it so the outer catch does not close it again.
            charset_detector = NULL;
            if (res != NULL)
            {
                encoding = std::string(res);
                docwire_log(debug) << "Estimated encoding: " + encoding;
            }
            else
            {
                encoding = "ASCII";
                docwire_log(debug) << "Could not detect encoding. Document is assumed to be encoded in ASCII";
                docwire_log(debug) << "But it can be also binary. Sequences of printable characters will be extracted.";
                content = sequences_of_printable_characters(content);
            }
        }
        if (encoding != "utf-8" && encoding != "UTF-8")
        {
            try
            {
                converter = new htmlcxx::CharsetConverter(encoding, "UTF-8");
            }
            catch (htmlcxx::CharsetConverter::Exception& ex)
            {
                // The constructor threw before the assignment, so converter is still NULL
                // and there is nothing to free here. Parsing continues with the raw content.
                sendTag(make_nested_ptr(ex, make_error("Cannot convert text to UTF-8", encoding)));
            }
        }
        if (converter)
        {
            text = converter->convert(content);
            delete converter;
            converter = NULL;
        }
        else
        {
            // content is not used after this point; swap instead of copying the whole document.
            text.swap(content);
        }
    }
    catch (const std::exception& e)
    {
        // Release whatever is still held before rethrowing (deleting a null pointer is a no-op).
        delete converter;
        converter = NULL;
        if (charset_detector)
            csd_close(charset_detector);
        charset_detector = NULL;
        std::throw_with_nested(make_error("Error converting text to UTF-8"));
    }
    bool parse_paragraphs = impl().m_parse_paragraphs.v;
    bool parse_lines = impl().m_parse_lines.v;
    sendTag(tag::Document{});
    if (parse_lines || parse_paragraphs)
    {
        std::string::size_type curr_pos = 0;
        enum { outside_paragraph, empty_paragraph, filled_paragraph } paragraph_state = outside_paragraph;
        // EOL sequence that terminated the previously processed line.
        std::string last_eol = "";
        for (;;)
        {
            // Find the next line terminator; "\r\n" is treated as a single EOL.
            std::string::size_type eol_pos = text.find_first_of("\r\n", curr_pos);
            std::string eol = (eol_pos == std::string::npos ? std::string{""} : text.substr(eol_pos, 1));
            if (eol == "\r" && eol_pos + 1 < text.size() && text[eol_pos + 1] == '\n')
                eol += '\n';
            // When no EOL is left, eol_pos is npos and substr() returns the rest of the text.
            std::string line = text.substr(curr_pos, eol_pos - curr_pos);
            if (parse_paragraphs)
            {
                if (paragraph_state == outside_paragraph)
                {
                    sendTag(tag::Paragraph{});
                    paragraph_state = empty_paragraph;
                }
                if (line.empty())
                {
                    // A blank line ends the paragraph. The EOL that terminated the paragraph's
                    // last text line has not been emitted yet, so send it first; otherwise one
                    // line break is lost between this paragraph and the following content.
                    // This is done only for a real blank line (one that has its own EOL), not
                    // for the empty remainder after a trailing EOL at the end of the text.
                    if (paragraph_state == filled_paragraph && !eol.empty())
                        sendTag(tag::Text{.text = last_eol});
                    sendTag(tag::CloseParagraph{});
                    paragraph_state = outside_paragraph;
                }
                else
                {
                    // Separate this line from the previous line of the same paragraph.
                    if (paragraph_state == filled_paragraph)
                    {
                        if (parse_lines)
                            sendTag(tag::BreakLine{});
                        else
                            sendTag(tag::Text{.text = last_eol});
                    }
                    sendTag(tag::Text{.text = line});
                    paragraph_state = filled_paragraph;
                }
            }
            else
            {
                // Line mode without paragraphs: emit the line, then its terminator.
                if (!line.empty())
                    sendTag(tag::Text{.text = line});
                if (!eol.empty())
                {
                    if (parse_lines)
                        sendTag(tag::BreakLine{});
                    else
                        sendTag(tag::Text{.text = eol});
                }
            }
            // No terminator found means this was the last line.
            if (eol.empty())
                break;
            curr_pos = eol_pos + eol.size();
            last_eol = eol;
        }
        // Close a paragraph that is still open at the end of the text.
        if (parse_paragraphs && paragraph_state != outside_paragraph)
            sendTag(tag::CloseParagraph{});
    }
    else
        sendTag(tag::Text{.text = text});
    sendTag(tag::CloseDocument{});
}
} // namespace docwire
txt_parser.cpp
Bug with parsing .txt files: the content shown from .txt files is always missing some '\n' characters.
(e.g. when the file is:
1.hello(\r)\n
2.(\r)\n
3.(\r)\n
4.world
and the output is:
1.hello\n
2.\n
3.world
4.\n
5.\n
) An EOL is missing between the line "hello" and the line "world".
The revised code for txt_parser.cpp below should work.
if (parse_paragraphs)
{
if (paragraph_state == outside_paragraph)
{
sendTag(tag::Paragraph{});
paragraph_state = empty_paragraph;
}
if (line.empty())
{
// ----------------------------------------------------------------------------
if (paragraph_state == filled_paragraph) {
sendTag(tag::Text{.text = last_eol});
}
// ----------------------------------------------------------------------------
sendTag(tag::CloseParagraph{});
paragraph_state = outside_paragraph;
}
The text was updated successfully, but these errors were encountered:
Thank you very much for your analysis and the fix. We will probably add more automated tests around this topic to the continuous integration procedure and apply the fix in the next release.
Bug with parsing .txt files: the content shown from .txt files is always missing some '\n' characters.
(e.g. when the file is:
1.hello(\r)\n
2.(\r)\n
3.(\r)\n
4.world
and the output is:
1.hello\n
2.\n
3.world
4.\n
5.\n
) An EOL is missing between the line "hello" and the line "world".
The revised code for txt_parser.cpp below should work.
The text was updated successfully, but these errors were encountered: