Bug with txt_parser.cpp #167

FlyDre · 2025-01-24T01:02:04Z

// Parses plain-text input and emits it as a stream of tags via sendTag().
//
// Stage 1: read the content of `data` as a string, try to detect its character
//          encoding and, unless it is reported as UTF-8, convert it to UTF-8.
// Stage 2: emit tag::Document, then the text (as a single tag::Text, or split
//          into lines and/or paragraphs depending on the m_parse_lines and
//          m_parse_paragraphs settings), then tag::CloseDocument.
//
// A std::exception raised during stage 1 is rethrown nested inside an
// "Error converting text to UTF-8" error.
void TXTParser::parse(const data_source& data)
{
	docwire_log(debug) << "Using TXT parser.";
	std::string text;
	// Both resources are declared outside the try block so that the outer
	// catch handler below can release them.
	csd_t charset_detector = NULL;
	htmlcxx::CharsetConverter* converter = NULL;
	try
	{
		std::string encoding;
		std::string content = data.string();
		charset_detector = csd_open();
		// (csd_t)-1 is treated as "detector could not be created".
		if (charset_detector == (csd_t)-1)
		{
			charset_detector = NULL;
			sendTag(make_error_ptr("Could not create charset detector"));
			// Without a detector the content is assumed to be UTF-8 already.
			encoding = "UTF-8";
		}
		else
		{
			csd_consider(charset_detector, content.c_str(), content.length());
			// The value returned by csd_close() is used as the encoding name.
			const char* res = csd_close(charset_detector);
			// Reset the handle so the catch handler does not close it a second time.
			charset_detector = NULL;
			if (res != NULL)
			{
				encoding = std::string(res);
				docwire_log(debug) << "Estimated encoding: " + encoding;
			}
			else
			{
				// No encoding reported: fall back to ASCII and keep only the
				// sequences of printable characters, since the data may be binary.
				encoding = "ASCII";
				docwire_log(debug) << "Could not detect encoding. Document is assumed to be encoded in ASCII";
				docwire_log(debug) << "But it can be also binary. Sequences of printable characters will be extracted.";
				content = sequences_of_printable_characters(content);
			}
		}
		// Conversion is skipped only for these two exact spellings of UTF-8.
		if (encoding != "utf-8" && encoding != "UTF-8")
		{
			try
			{
				converter = new htmlcxx::CharsetConverter(encoding, "UTF-8");
			}
			catch (htmlcxx::CharsetConverter::Exception& ex)
			{
				// The failure is reported as a tag rather than thrown; with
				// converter left NULL the content is used unconverted below.
				sendTag(make_nested_ptr(ex, make_error("Cannot convert text to UTF-8", encoding)));
				if (converter)
					delete converter;
				converter = NULL;
			}
		}
		if (converter)
		{
			text = converter->convert(content);
			delete converter;
			converter = NULL;
		}
		else
			text = content;
	}
	catch (const std::exception& e)
	{
		// Release whatever is still held, then rethrow with added context.
		if (converter)
			delete converter;
		converter = NULL;
		if (charset_detector)
			csd_close(charset_detector);
		charset_detector = NULL;
		std::throw_with_nested(make_error("Error converting text to UTF-8"));
	}
	bool parse_paragraphs = impl().m_parse_paragraphs.v;
	bool parse_lines = impl().m_parse_lines.v;
	sendTag(tag::Document{});
	if (parse_lines || parse_paragraphs)
	{
		// Index in `text` where the current line starts.
		std::string::size_type curr_pos = 0;
		enum { outside_paragraph, empty_paragraph, filled_paragraph } paragraph_state = outside_paragraph;
		// EOL sequence that terminated the previous line ("" before the first line).
		std::string last_eol = "";
		for (;;)
		{
			// Locate the end of the current line. `eol` is "" when the rest of
			// the text contains no line terminator (i.e. this is the last line).
			std::string::size_type eol_pos = text.find_first_of("\r\n", curr_pos);
			std::string eol = (eol_pos == std::string::npos ? std::string{""} : text.substr(eol_pos, 1));
			// Treat "\r\n" as a single two-character EOL.
			if (eol == "\r" && eol_pos + 1 < text.size() && text[eol_pos + 1] == '\n')
				eol += '\n';
			// When eol_pos is npos the count exceeds the remaining length and
			// substr() returns everything from curr_pos to the end of the text.
			std::string line = text.substr(curr_pos, eol_pos - curr_pos);
			if (parse_paragraphs)
			{
				// Every line processed while outside a paragraph opens a new one,
				// including an empty line (which then closes it again just below).
				if (paragraph_state == outside_paragraph)
				{
					sendTag(tag::Paragraph{});
					paragraph_state = empty_paragraph;
				}
				if (line.empty())
				{
					// An empty line ends the current paragraph. Note that no
					// Text/BreakLine tag is emitted here for the EOL that
					// terminated the paragraph's last line.
					sendTag(tag::CloseParagraph{});
					paragraph_state = outside_paragraph;
				}
				else
				{
					// Separate this line from the previous line of the same
					// paragraph: a BreakLine tag when lines are parsed, otherwise
					// the original EOL characters as text.
					if (paragraph_state == filled_paragraph)
					{
						if (parse_lines)
							sendTag(tag::BreakLine{});
						else
							sendTag(tag::Text{.text = last_eol});
					}
					sendTag(tag::Text{.text = line});
					paragraph_state = filled_paragraph;
				}
			}
			else
			{
				// Line-only mode (parse_lines is necessarily true in this branch):
				// emit the line text if any, then a tag for its EOL if it has one.
				if (!line.empty())
					sendTag(tag::Text{.text = line});
				if (!eol.empty())
				{
					if (parse_lines)
						sendTag(tag::BreakLine{});
					else
						sendTag(tag::Text{.text = eol});
				}
			}
			// No EOL found means the last line has just been processed.
			if (eol.empty())
				break;
			curr_pos = eol_pos + eol.size();
			last_eol = eol;
		}
		// Close a paragraph left open by a final non-empty line.
		if (parse_paragraphs && paragraph_state != outside_paragraph)
			sendTag(tag::CloseParagraph{});
	}
	else
		// Neither lines nor paragraphs requested: emit the whole text at once.
		sendTag(tag::Text{.text = text});
	sendTag(tag::CloseDocument{});
}

} // namespace docwire

txt_parser.cpp

Bug when parsing .txt files: the content shown from .txt files is always missing some '\n' characters.
(e.g. when the file is:
1.hello(\r)\n
2.(\r)\n
3.(\r)\n
4.world
and the output is:
1.hello\n
2.\n
3.world
4.\n
5.\n
) An EOL is missing between the "hello" line and the "world" line.

The revised code for txt_parser.cpp below should fix this.

if (parse_paragraphs)
			{
				if (paragraph_state == outside_paragraph)
				{
					sendTag(tag::Paragraph{});
					paragraph_state = empty_paragraph;
				}
				if (line.empty())
				{
// ----------------------------------------------------------------------------
                                        if (paragraph_state == filled_paragraph) {
						sendTag(tag::Text{.text = last_eol});
					}
// ----------------------------------------------------------------------------
					sendTag(tag::CloseParagraph{});
					paragraph_state = outside_paragraph;
				}

The text was updated successfully, but these errors were encountered:

as-ascii · 2025-01-27T16:33:29Z

Thank you very much for your analysis and the fix. We will probably add more automated tests around this topic to the continuous integration procedure and apply the fix in the next release.

Best regards!

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Bug with txt_parser.cpp #167

Bug with txt_parser.cpp #167

Bug with txt_parser.cpp #167

Bug with txt_parser.cpp #167

Comments