Bug with txt_parser.cpp · Issue #167 · docwire/docwire · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Bug with txt_parser.cpp #167

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
FlyDre opened this issue Jan 24, 2025 · 1 comment
Open

Bug with txt_parser.cpp #167

FlyDre opened this issue Jan 24, 2025 · 1 comment

Comments

@FlyDre
Copy link
FlyDre commented Jan 24, 2025
// Parses plain-text input and emits it as a sequence of tags via sendTag().
//
// Steps:
//  1. Read the whole input into a string and run the charset detector on it.
//  2. If the resulting encoding name is not "utf-8"/"UTF-8", try to convert the
//     content to UTF-8 with htmlcxx::CharsetConverter.
//  3. Emit tag::Document, then the text, then tag::CloseDocument. Depending on
//     the m_parse_paragraphs / m_parse_lines settings the text is sent either
//     as one tag::Text or split at "\r", "\n" and "\r\n" into tag::Text,
//     tag::BreakLine, tag::Paragraph and tag::CloseParagraph tags.
//
// Non-fatal problems (detector could not be created, converter could not be
// constructed) are reported through sendTag() as error tags and parsing
// continues with the unconverted content. Any std::exception escaping the
// detection/conversion phase is rethrown nested inside an
// "Error converting text to UTF-8" error.
void TXTParser::parse(const data_source& data)
{
	docwire_log(debug) << "Using TXT parser.";
	std::string text;
	// Both resources are tracked by raw handle/pointer and released manually,
	// including in the catch block below.
	csd_t charset_detector = NULL;
	htmlcxx::CharsetConverter* converter = NULL;
	try
	{
		std::string encoding;
		std::string content = data.string();
		charset_detector = csd_open();
		// (csd_t)-1 is treated as "detector could not be created".
		if (charset_detector == (csd_t)-1)
		{
			charset_detector = NULL;
			sendTag(make_error_ptr("Could not create charset detector"));
			// Falling back to "UTF-8" means no conversion is attempted below.
			encoding = "UTF-8";
		}
		else
		{
			csd_consider(charset_detector, content.c_str(), content.length());
			// The value returned by csd_close() is used as the detected encoding name.
			const char* res = csd_close(charset_detector);
			// Handle is already closed; clear it so the catch block does not close it again.
			charset_detector = NULL;
			if (res != NULL)
			{
				encoding = std::string(res);
				docwire_log(debug) << "Estimated encoding: " + encoding;
			}
			else
			{
				// No encoding reported: assume ASCII and keep only what
				// sequences_of_printable_characters() returns for the content.
				encoding = "ASCII";
				docwire_log(debug) << "Could not detect encoding. Document is assumed to be encoded in ASCII";
				docwire_log(debug) << "But it can be also binary. Sequences of printable characters will be extracted.";
				content = sequences_of_printable_characters(content);
			}
		}
		// Only these two exact spellings skip conversion; any other name
		// (including other capitalisations of UTF-8) goes through the converter.
		if (encoding != "utf-8" && encoding != "UTF-8")
		{
			try
			{
				converter = new htmlcxx::CharsetConverter(encoding, "UTF-8");
			}
			catch (htmlcxx::CharsetConverter::Exception& ex)
			{
				// Converter construction failure is reported as an error tag, not rethrown;
				// the content is then passed through unconverted (see the else branch below).
				sendTag(make_nested_ptr(ex, make_error("Cannot convert text to UTF-8", encoding)));
				// Defensive cleanup: the assignment above did not complete if the
				// constructor threw, so converter is still NULL at this point.
				if (converter)
					delete converter;
				converter = NULL;
			}
		}
		if (converter)
		{
			text = converter->convert(content);
			delete converter;
			converter = NULL;
		}
		else
			text = content;
	}
	catch (const std::exception& e)
	{
		// Release whatever is still held, then rethrow wrapped in a higher-level error.
		if (converter)
			delete converter;
		converter = NULL;
		if (charset_detector)
			csd_close(charset_detector);
		charset_detector = NULL;
		std::throw_with_nested(make_error("Error converting text to UTF-8"));
	}
	bool parse_paragraphs = impl().m_parse_paragraphs.v;
	bool parse_lines = impl().m_parse_lines.v;
	sendTag(tag::Document{});
	if (parse_lines || parse_paragraphs)
	{
		// Start offset of the line currently being processed.
		std::string::size_type curr_pos = 0;
		// outside_paragraph: no tag::Paragraph is currently open.
		// empty_paragraph:   tag::Paragraph was sent but no text has been sent in it yet.
		// filled_paragraph:  at least one tag::Text line has been sent in the open paragraph.
		enum { outside_paragraph, empty_paragraph, filled_paragraph } paragraph_state = outside_paragraph;
		// EOL sequence that terminated the previous line ("" before the first line).
		std::string last_eol = "";
		for (;;)
		{
			// Locate the next line terminator; "\r\n" is treated as a single EOL.
			std::string::size_type eol_pos = text.find_first_of("\r\n", curr_pos);
			std::string eol = (eol_pos == std::string::npos ? std::string{""} : text.substr(eol_pos, 1));
			if (eol == "\r" && eol_pos + 1 < text.size() && text[eol_pos + 1] == '\n')
				eol += '\n';
			// When eol_pos is npos the count is very large, so substr() returns
			// everything from curr_pos to the end of the text.
			std::string line = text.substr(curr_pos, eol_pos - curr_pos);
			if (parse_paragraphs)
			{
				if (paragraph_state == outside_paragraph)
				{
					sendTag(tag::Paragraph{});
					paragraph_state = empty_paragraph;
				}
				if (line.empty())
				{
					// An empty line ends the current paragraph. If the paragraph was
					// opened just above, this yields an empty Paragraph/CloseParagraph
					// pair. NOTE: neither last_eol nor this line's own EOL is emitted
					// in this branch.
					sendTag(tag::CloseParagraph{});
					paragraph_state = outside_paragraph;
				}
				else
				{
					// Separate consecutive lines of the same paragraph: either a
					// BreakLine tag or the previous line's EOL characters as text.
					if (paragraph_state == filled_paragraph)
					{
						if (parse_lines)
							sendTag(tag::BreakLine{});
						else
							sendTag(tag::Text{.text = last_eol});
					}
					sendTag(tag::Text{.text = line});
					paragraph_state = filled_paragraph;
				}
			}
			else
			{
				// Line-only mode: each non-empty line becomes a Text tag followed by
				// a tag for its terminator.
				if (!line.empty())
					sendTag(tag::Text{.text = line});
				if (!eol.empty())
				{
					// NOTE(review): parse_paragraphs is false here and the enclosing
					// condition requires parse_lines || parse_paragraphs, so parse_lines
					// is always true in this branch and the else below is never taken.
					if (parse_lines)
						sendTag(tag::BreakLine{});
					else
						sendTag(tag::Text{.text = eol});
				}
			}
			// No terminator found means this was the final line.
			if (eol.empty())
				break;
			curr_pos = eol_pos + eol.size();
			last_eol = eol;
		}
		// Close a paragraph that is still open when the text ends.
		if (parse_paragraphs && paragraph_state != outside_paragraph)
			sendTag(tag::CloseParagraph{});
	}
	else
		// Neither option enabled: the whole text is sent as a single Text tag.
		sendTag(tag::Text{.text = text});
	sendTag(tag::CloseDocument{});
}

} // namespace docwire
  • txt_parser.cpp

Bug when parsing .txt files: the content shown from .txt files is always missing some '\n' characters.
(e.g. when the file is:
1.hello(\r)\n
2.(\r)\n
3.(\r)\n
4.world
and the output is:
1.hello\n
2.\n
3.world
4.\n
5.\n
) An EOL is missing between the "hello" line and the "world" line.

The revised code for txt_parser.cpp below should work.

if (parse_paragraphs)
			{
				if (paragraph_state == outside_paragraph)
				{
					sendTag(tag::Paragraph{});
					paragraph_state = empty_paragraph;
				}
				if (line.empty())
				{
// ----------------------------------------------------------------------------
                                        if (paragraph_state == filled_paragraph) {
						sendTag(tag::Text{.text = last_eol});
					}
// ----------------------------------------------------------------------------
					sendTag(tag::CloseParagraph{});
					paragraph_state = outside_paragraph;
				}
@as-ascii
Copy link
Contributor

Thank you very much for your analysis and the fix. We will probably add more automated tests around this topic to the continuous integration procedure and apply the fix in the next release.

Best regards!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants
0