diff --git a/.cross_rubies b/.cross_rubies deleted file mode 100644 index 019ab8c8f2..0000000000 --- a/.cross_rubies +++ /dev/null @@ -1,36 +0,0 @@ -3.1.0:aarch64-linux-gnu -3.1.0:aarch64-linux-musl -3.1.0:arm-linux-gnu -3.1.0:arm-linux-musl -3.1.0:arm64-darwin -3.1.0:x64-mingw-ucrt -3.1.0:x86_64-darwin -3.1.0:x86_64-linux-gnu -3.1.0:x86_64-linux-musl -3.2.0:aarch64-linux-gnu -3.2.0:aarch64-linux-musl -3.2.0:arm-linux-gnu -3.2.0:arm-linux-musl -3.2.0:arm64-darwin -3.2.0:x64-mingw-ucrt -3.2.0:x86_64-darwin -3.2.0:x86_64-linux-gnu -3.2.0:x86_64-linux-musl -3.3.5:aarch64-linux-gnu -3.3.5:aarch64-linux-musl -3.3.5:arm-linux-gnu -3.3.5:arm-linux-musl -3.3.5:arm64-darwin -3.3.5:x64-mingw-ucrt -3.3.5:x86_64-darwin -3.3.5:x86_64-linux-gnu -3.3.5:x86_64-linux-musl -3.4.0:aarch64-linux-gnu -3.4.0:aarch64-linux-musl -3.4.0:arm-linux-gnu -3.4.0:arm-linux-musl -3.4.0:arm64-darwin -3.4.0:x64-mingw-ucrt -3.4.0:x86_64-darwin -3.4.0:x86_64-linux-gnu -3.4.0:x86_64-linux-musl diff --git a/.github/ISSUE_TEMPLATE/2-installation-difficulties.md b/.github/ISSUE_TEMPLATE/2-installation-difficulties.md index 17929f6c70..8fa2743663 100644 --- a/.github/ISSUE_TEMPLATE/2-installation-difficulties.md +++ b/.github/ISSUE_TEMPLATE/2-installation-difficulties.md @@ -44,8 +44,6 @@ the mkmf.log file contents go here What is the output from `ruby -v`? -What is the output from `gem -v`? - What is the output from `gem env`? ``` @@ -53,12 +51,10 @@ the output of "gem env" output goes here ``` -If you're using Bundler: -- what is the output from `bundle version`? -- what is the output from `bundle config`? (Take care to redact any credentials) +If you're installing using Bundler, what is the output from `bundle env`? 
(Take care to redact any credentials) ``` -the output of "bundle config" goes here +the output of "bundle env" goes here ``` If you're on MacOS, please note: diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 8e5c8f42a7..d7612c655c 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,6 +5,9 @@ updates: directory: "/" schedule: interval: "weekly" + groups: + development-dependencies: + dependency-type: "development" - package-ecosystem: "github-actions" directory: "/" schedule: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e66937d28c..bd1c7aca6b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -97,7 +97,7 @@ jobs: outputs: # these are usually the same, but are different once we get to ruby release candidates setup_ruby: "['3.1', '3.2', '3.3', '3.4']" - setup_ruby_win: "['3.1', '3.2', '3.3', 'head']" + setup_ruby_win: "['3.1', '3.2', '3.3', '3.4']" image_tag: "['3.1', '3.2', '3.3', '3.4']" runs-on: ubuntu-latest steps: @@ -297,7 +297,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - uses: cachix/install-nix-action@v30 + - uses: cachix/install-nix-action@v31 with: nix_path: nixpkgs=channel:nixos-24.11 - run: nix-shell --packages ruby bundler --run 'bundle install' diff --git a/.github/workflows/downstream.yml b/.github/workflows/downstream.yml index a163e703d0..55bc089745 100644 --- a/.github/workflows/downstream.yml +++ b/.github/workflows/downstream.yml @@ -16,7 +16,6 @@ on: types: [opened, synchronize] branches: - '*' - jobs: downstream: name: downstream-${{matrix.name}} @@ -32,11 +31,10 @@ jobs: name: rails-html-sanitizer command: "bundle exec rake test" ruby: "3.4" - # # pending a fix for https://github.com/sparklemotion/nokogiri/pull/3348 - # - url: https://github.com/rgrove/sanitize - # name: sanitize - # command: "bundle exec rake test" - # ruby: "3.4" + - url: https://github.com/rgrove/sanitize + name: sanitize + command: "bundle exec rake test" + ruby: "3.4" - 
url: https://github.com/ebeigarts/signer name: signer command: "bundle exec rake spec" diff --git a/.github/workflows/upstream.yml b/.github/workflows/upstream.yml index fbe3022a17..1d2cef63d5 100644 --- a/.github/workflows/upstream.yml +++ b/.github/workflows/upstream.yml @@ -32,7 +32,7 @@ jobs: - uses: ruby/setup-ruby-pkgs@v1 with: ruby-version: "3.3" - apt-get: "autogen libtool shtool" + apt-get: "autogen libtool shtool liblzma-dev" brew: "automake autogen libtool shtool" mingw: "autotools xz" bundler-cache: true @@ -182,6 +182,10 @@ jobs: - uses: actions/checkout@v4 with: submodules: true + - uses: actions/setup-java@v4 + with: + distribution: zulu + java-version: 21 - uses: ruby/setup-ruby@v1 with: ruby-version: "jruby-head" diff --git a/.rubocop.yml b/.rubocop.yml index def815012c..a1ed3aec24 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -5,12 +5,14 @@ inherit_mode: require: - standard - - standard-custom - - standard-performance - - rubocop-performance + +plugins: - rubocop-minitest - rubocop-packaging + - rubocop-performance - rubocop-rake + - standard-custom + - standard-performance inherit_gem: standard: config/base.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index b9aa6fbd25..e0db75274d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,86 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA --- +## next / unreleased + +### Improved + +* [CRuby] The HTML5 parser now has linear performance when parsing many attributes. Previously performance was quadratic due to two hotspots, one in detecting duplicate attributes and one in constructing the libxml2 data structures. (#3393) @flavorjones + +### Changed + +* The constant `Struct::HTMLElementDescription` is no longer defined. (#3432, #3433) @viralpraxis + + +### Fixed + +* [CRuby] When a namespace is set on an unparented node, ensure the namespace is defined on the node. 
(#3459, #3462) +* [CRuby] Builder now correctly builds namespaced nodes that define their own namespace when that ns prefix collides with one defined by the parent (or another ancestor). (#3458, #3461) @flavorjones +* [JRuby] Fixed multiple issues with `Node#namespace_definitions` so that it now behaves identically to CRuby. (#2543, #3460) @flavorjones +* [JRuby] `Document#create_element` and `Node.new` no longer set the namespace to the document's default namespace. The namespace must be set explicitly with `namespace=` or by parenting the node. (#3457, #3463) @flavorjones + + +### Dependencies + +* [CRuby] Update to rake-compiler-dock v1.9.1 for building precompiled native gems. (#3404, #3418) @flavorjones + + +## v1.18.8 / 2025-04-21 + +### Security + +* [CRuby] Vendored libxml2 is updated to [v2.13.8](https://gitlab.gnome.org/GNOME/libxml2/-/releases/v2.13.8) to address CVE-2025-32414 and CVE-2025-32415. See [GHSA-5w6v-399v-w3cc](https://github.com/sparklemotion/nokogiri/security/advisories/GHSA-5w6v-399v-w3cc) for more information. + + +## v1.18.7 / 2025-03-31 + +### Dependencies + +* [CRuby] Vendored libxml2 is updated to [v2.13.7](https://gitlab.gnome.org/GNOME/libxml2/-/releases/v2.13.7), which is a bugfix release. + + +## v1.18.6 / 2025-03-24 + +### Fixed + +* [JRuby] In HTML documents, `Node#attribute` now returns the correct attribute. This has been broken, and returning `nil`, since v1.17.0. (#3487) @flavorjones + + +## v1.18.5 / 2025-03-19 + +### Fixed + +* [JRuby] Update JRuby's XML serialization so it outputs namespaces exactly like CRuby. (#3455, #3456) @johnnyshields + + +## v1.18.4 / 2025-03-14 + +### Security + +* [CRuby] Vendored libxslt is updated to [v1.1.43](https://gitlab.gnome.org/GNOME/libxslt/-/releases/v1.1.43) to address CVE-2025-24855 and CVE-2024-55549. See [GHSA-mrxw-mxhj-p664](https://github.com/sparklemotion/nokogiri/security/advisories/GHSA-mrxw-mxhj-p664) for more information. 
+ + +## v1.18.3 / 2025-02-18 + +### Security + +* [CRuby] Vendored libxml2 is updated to [v2.13.6](https://gitlab.gnome.org/GNOME/libxml2/-/releases/v2.13.6) to address CVE-2025-24928 and CVE-2024-56171. See [GHSA-vvfq-8hwr-qm4m](https://github.com/sparklemotion/nokogiri/security/advisories/GHSA-vvfq-8hwr-qm4m) for more information. + + +## v1.18.2 / 2025-01-19 + +### Fixed + +* When performing a CSS selector query, an XML document's root namespace declarations should not be applied to wildcard selectors (`"*"`). Fixes a bug introduced in v1.17.0. (#3411) @flavorjones + + +## v1.18.1 / 2024-12-29 + +### Fixed + +* [CRuby] XML::SAX::ParserContext keeps a reference to the input to avoid a potential use-after-free issue that's existed since v1.4.0 (2009). (#3395) @flavorjones + + +## v1.18.0 / 2024-12-25 ### Notable Changes @@ -24,6 +104,11 @@ This release drops precompiled native platform gems for `x86-linux` and `x86-min * [CRuby] CSS and XPath queries are faster now that `Node#xpath`, `Node#css`, and related functions are using a faster XPathContext initialization process. We benchmarked a 1.9x improvement for a 6kb file. Big thanks to @nwellnhof for helping with this one. (#3378, superseded by #3389) @flavorjones +### Dependencies + +* [CRuby] Update to rake-compiler-dock v1.7.0 for building precompiled native gems.
(#3375, #3392) @flavorjones + + ## v1.17.2 / 2024-12-12 ### Fixed diff --git a/Gemfile b/Gemfile index d5c42a0b1f..887324ab6a 100644 --- a/Gemfile +++ b/Gemfile @@ -10,31 +10,31 @@ group :development do gem "rake", "13.2.1" # building extensions - gem "rake-compiler", "1.2.8" - gem "rake-compiler-dock", "1.7.0" + gem "rake-compiler", "1.3.0" + gem "rake-compiler-dock", "1.9.1" # parser generator gem "rexical", "1.0.8" # tests - gem "minitest", "5.25.4" + gem "minitest", "5.25.5" gem "minitest-parallel_fork", "2.0.0" - gem "ruby_memcheck", "3.0.0" - gem "rubyzip", "~> 2.3.2" - gem "simplecov", "= 0.21.2" + gem "ruby_memcheck", "3.0.1" + gem "rubyzip", "~> 2.4.1" + gem "simplecov", "0.22.0" # rubocop - gem "standard", "1.43.0" - gem "rubocop-minitest", "0.36.0" - gem "rubocop-packaging", "0.5.2" - gem "rubocop-rake", "0.6.0" + unless RUBY_PLATFORM == "java" + gem "standard", "1.50.0" + gem "rubocop-minitest", "0.38.0" + gem "rubocop-packaging", "0.6.0" + gem "rubocop-rake", "0.7.1" + end end # If Psych doesn't build, you can disable this group locally by running # `bundle config set --local without rdoc` # Then re-run `bundle install`. 
-unless RUBY_PLATFORM == "java" # see #3391 and https://github.com/jruby/jruby/issues/7262 - group :rdoc do - gem "rdoc", "6.10.0" - end +group :rdoc do + gem "rdoc", "6.14.0" unless RUBY_PLATFORM == "java" || ENV["CI"] end diff --git a/LICENSE-DEPENDENCIES.md b/LICENSE-DEPENDENCIES.md index 1e950b6e37..649027f369 100644 --- a/LICENSE-DEPENDENCIES.md +++ b/LICENSE-DEPENDENCIES.md @@ -15,6 +15,7 @@ Note that this document is broken into multiple sections, each of which describe * [Native WindowsⓇ platform releases ("x64-mingw-ucrt")](#native-windows%E2%93%A1-platform-releases-x64-mingw-ucrt) * [JavaⓇ (JRuby) platform release ("java")](#java%E2%93%A1-jruby-platform-release-java) - [Appendix: Dependencies' License Texts](#appendix-dependencies-license-texts) + * [hashmap.c](#hashmapc) * [libgumbo](#libgumbo) * [libxml2](#libxml2) * [libxslt](#libxslt) @@ -112,6 +113,34 @@ This section contains a subsection for each potentially-distributed dependency, Please see previous sections to understand which of these potential dependencies is actually distributed in the gem file you're downloading and using. +### hashmap.c + +MIT + +https://github.com/tidwall/hashmap.c/blob/master/LICENSE + + The MIT License (MIT) + + Copyright (c) 2020 Joshua J Baker + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal in + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + the Software, and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + ### libgumbo Apache 2.0 diff --git a/README.md b/README.md index df466f4a83..b0d60dd065 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ Please use the "Bug Report" or "Installation Difficulties" templates. Please report vulnerabilities at https://hackerone.com/nokogiri -Full information and description of our security policy is in [`SECURITY.md`](SECURITY.md) +See [SECURITY.md](SECURITY.md) for full information and description of our security policy. ### Semantic Versioning Policy @@ -83,7 +83,6 @@ We bump `Major.Minor.Patch` versions following this guidance: `Major`: (we've never done this) - Significant backwards-incompatible changes to the public API that would require rewriting existing application code. -- Some examples of backwards-incompatible changes we might someday consider for a Major release are at [`ROADMAP.md`](ROADMAP.md). `Minor`: @@ -242,7 +241,7 @@ These dependencies are met by default by Nokogiri's packaged versions of the lib We provide native gems by pre-compiling libxml2 and libxslt (and potentially zlib and libiconv) and packaging them into the gem file. In this case, no compilation is necessary at installation time, which leads to faster and more reliable installation. -See [`LICENSE-DEPENDENCIES.md`](LICENSE-DEPENDENCIES.md) for more information on which dependencies are provided in which native and source gems. +See [LICENSE-DEPENDENCIES.md](LICENSE-DEPENDENCIES.md) for more information on which dependencies are provided in which native and source gems. 
### JRuby @@ -251,29 +250,31 @@ The Java (a.k.a. JRuby) implementation is a Java extension that depends primaril These dependencies are provided by pre-compiled jar files packaged in the `java` platform gem. -See [`LICENSE-DEPENDENCIES.md`](LICENSE-DEPENDENCIES.md) for more information on which dependencies are provided in which native and source gems. +See [LICENSE-DEPENDENCIES.md](LICENSE-DEPENDENCIES.md) +for more information on which dependencies are provided in which native and source gems. ## Contributing -See [`CONTRIBUTING.md`](CONTRIBUTING.md) for an intro guide to developing Nokogiri. +See [CONTRIBUTING.md](CONTRIBUTING.md) for an intro guide to developing Nokogiri. ## Code of Conduct -We've adopted the Contributor Covenant code of conduct, which you can read in full in [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md). +See the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md). ## License This project is licensed under the terms of the MIT license. -See this license at [`LICENSE.md`](LICENSE.md). +See [LICENSE.md](LICENSE.md). ### Dependencies -Some additional libraries may be distributed with your version of Nokogiri. Please see [`LICENSE-DEPENDENCIES.md`](LICENSE-DEPENDENCIES.md) for a discussion of the variations as well as the licenses thereof. +Some additional libraries may be distributed with your version of Nokogiri. +See [LICENSE-DEPENDENCIES.md](LICENSE-DEPENDENCIES.md) for a discussion of the variations as well as the licenses thereof. 
## Authors diff --git a/dependencies.yml b/dependencies.yml index c8246b85bd..276d991b92 100644 --- a/dependencies.yml +++ b/dependencies.yml @@ -1,13 +1,13 @@ --- libxml2: - version: "2.13.5" - sha256: "74fc163217a3964257d3be39af943e08861263c4231f9ef5b496b6f6d4c7b2b6" - # sha-256 hash provided in https://download.gnome.org/sources/libxml2/2.13/libxml2-2.13.5.sha256sum + version: "2.13.8" + sha256: "277294cb33119ab71b2bc81f2f445e9bc9435b893ad15bb2cd2b0e859a0ee84a" + # sha-256 hash provided in https://download.gnome.org/sources/libxml2/2.13/libxml2-2.13.8.sha256sum libxslt: - version: "1.1.42" - sha256: "85ca62cac0d41fc77d3f6033da9df6fd73d20ea2fc18b0a3609ffb4110e1baeb" - # sha-256 hash provided in https://download.gnome.org/sources/libxslt/1.1/libxslt-1.1.42.sha256sum + version: "1.1.43" + sha256: "5a3d6b383ca5afc235b171118e90f5ff6aa27e9fea3303065231a6d403f0183a" + # sha-256 hash provided in https://download.gnome.org/sources/libxslt/1.1/libxslt-1.1.43.sha256sum zlib: version: "1.3.1" diff --git a/doc/examples/bookstore_setup.rb b/doc/examples/bookstore_setup.rb new file mode 100644 index 0000000000..2a94d9fce1 --- /dev/null +++ b/doc/examples/bookstore_setup.rb @@ -0,0 +1,33 @@ +require 'nokogiri' +BOOKSTORE_XML = <<-BOOKSTORE # Bookstore XML as a string. + + + Everyday Italian + Giada De Laurentiis + 2005 + 30.00 + + + Harry Potter + J K. Rowling + 2005 + 29.99 + + + XQuery Kick Start + James McGovern + Per Bothner + Kurt Cagle + James Linn + Vaidyanathan Nagarajan + 2003 + 49.99 + + + Learning XML + Erik T. Ray + 2003 + 39.95 + + +BOOKSTORE diff --git a/doc/keyword_arguments.md b/doc/keyword_arguments.md new file mode 100644 index 0000000000..0db206ea8c --- /dev/null +++ b/doc/keyword_arguments.md @@ -0,0 +1,139 @@ +## Keyword Arguments + +Many \Nokogiri methods take optional *positional* arguments. +Beginning in version 1.17.0 (December 2024), +some methods are being "modernized" to take optional *keyword* arguments, +which are far more versatile. 
+ +Example: + +```ruby +# Before. +XML::Document.parse(xml_s, nil, nil, options) +# After. +XML::Document.parse(xml_s, options: options) +``` + +### About the Examples + +Examples on this page assume that the following code has been executed: + +```ruby +require 'nokogiri' +include Nokogiri +xml_s = '' +url = 'www.site.com' +encoding = 'UTF-16' +options = XML::ParseOptions::STRICT +``` + +### Before + +Before the changes, the calling sequence for a method might have +trailing optional positional arguments: + +For example, the calling sequence `XML::Document.parse` was: + +``` +XML::Document.parse( + # Required leading argument. + input, + # Optional positional arguments. + url = nil, + encoding = nil, + options = XML::ParseOptions::DEFAULT_XML + ) +``` + +That calling sequence requires leading argument `input`, +and allows any of these: + +- No optional arguments. +- Optional argument `url` only. +- Optional arguments `url` and `encoding` only. +- Optional arguments `url`, `encoding`, and `options`. + +To pass arguments `input` and `options`, +a method call would also have to pass arguments `url` and `encoding`: + +``` +XML::Document.parse(xml_s, nil, nil, options) +``` + +### After + +The updated calling sequence allows trailing *keyword* arguments. + +The updated calling sequence for `XML::Document.parse`, for example, +allows optional keyword arguments `url`, `encoding`, and `options`. + +The updated calling sequence may be thought of as: + +``` +XML::Document.parse( + # Required leading argument. + input, + # Optional keyword arguments. + url:, + encoding:, + options: + ) +``` + +where `url`, `encoding`, and `options` are optional keyword arguments. 
+Thus, to pass arguments `input` and `options`, +a method call need only pass those two arguments (and not arguments `url` and `encoding`): + +```ruby +XML::Document.parse(xml_s, options: options) +``` + +Each of the optional keyword arguments may be given or omitted; +they may be given in any combination and in any order: + +```ruby +XML::Document.parse(xml_s, options: options, encoding: encoding, url: url) +``` + +The new calling sequence is fully compatible with the old, +so that this is still a valid call: + +```ruby +XML::Document.parse(xml_s, url, encoding, options) +``` + +### Details + +The updated calling sequence retains the optional positional arguments, +but adds trailing keyword arguments; +the default value for each keyword argument comes from the given (or default) +value of a positional argument. + +The actual updated calling sequence for `XML::Document.parse`, for example, is: + +```ruby +XML::Document.parse( + # Required leading argument. + input, + # Optional positional arguments. + url_ = nil, + encoding_ = nil, + options_ = XML::ParseOptions::DEFAULT_XML, + # Optional keyword arguments; each defaults to a positional argument value. + url: url_, + encoding: encoding_, + options: options_ +) +``` + +Valid calls to the method include: + +```ruby +# Positional arguments only. +XML::Document.parse(xml_s, url, encoding, options) +# Keyword arguments only, any order. +XML::Document.parse(xml_s, url: url, encoding: encoding, options: options) +XML::Document.parse(xml_s, encoding: encoding, options: options, url: url) +# Mixture of leading positional arguments and trailing keyword arguments. +XML::Document.parse(xml_s, url, options: options, encoding: encoding) +``` diff --git a/doc/xml/parsing.md b/doc/xml/parsing.md new file mode 100644 index 0000000000..1df120767d --- /dev/null +++ b/doc/xml/parsing.md @@ -0,0 +1,497 @@ +# Parsing + +This page shows how \Nokogiri parses an XML string into \Nokogiri objects.
+The string has text consisting of character data and markup. +For a \Nokogiri parsing method, the string is passed +either as a [String](https://docs.ruby-lang.org/en/master/String.html) object +or as an [IO](https://docs.ruby-lang.org/en/master/IO.html) object +from which the string is to be read. + +Most of the sections below link to a relevant section in the W3C document +[Extensible Markup Language (XML) 1.0 (Fifth Edition)](https://www.w3.org/TR/REC-xml/). + + +On this page, each example uses either: + +- Method Nokogiri::XML::parse (shorthand for Nokogiri::XML::Document.parse) + to parse a string into a tree of \Nokogiri objects. + The topmost object is a Nokogiri::XML::Document object, + which we will usually refer to as a document. + + A document may be childless (i.e., it may have no immediate child object), + or it may have a single immediate child Nokogiri::XML::Element object -- its root. + +- Method Nokogiri::XML::DocumentFragment.parse + to parse a string into a tree of \Nokogiri objects. + The topmost object is a Nokogiri::XML::DocumentFragment object, + which we will usually refer to as a fragment; + the fragment may have other objects as children. + + A document fragment may have multiple immediate child objects of various types. + +## Text + +The string to be parsed is text, consisting of +[character data and markup](https://www.w3.org/TR/REC-xml/#syntax). + +## Character Data + +All text that is not markup is character data. + +## Markup + +### Comments + +\Nokogiri parses an [XML comment](https://www.w3.org/TR/REC-xml/#sec-comments) +into a Nokogiri::XML::Comment object. + +A comment may be in the document itself or in a tag: + +``` +xml = '' +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ + #(Comment " Comment. "), + #(Element: { + name = "root", + children = [ #(Comment " Another comment.
")] + })] + }) +``` + +### Processing Instructions + +\Nokogiri parses an [XML processing instruction](https://www.w3.org/TR/REC-xml/#sec-pi) +into a Nokogiri::XML::ProcessingInstruction object: + +``` +xml = '' +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ + #(ProcessingInstruction: { + name = "xml-stylesheet" + })] + }) +``` + +### CDATA Sections + +\Nokogiri parses an [XML CDATA section](https://www.w3.org/TR/REC-xml/#sec-cdata-sect) +into a Nokogiri::XML::CDATA object: + +``` +xml = 'Hello, world!]]>' +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ + #(Element: { + name = "root", + children = [ + #(CDATA "Hello, world!")] + })] + }) +``` + +### Prolog (XML Declaration) + +\Nokogiri parses an [XML declaration](https://www.w3.org/TR/REC-xml/#sec-prolog-dtd) +into values put onto the parsed document: + +``` +xml = '' +doc = Nokogiri::XML.parse(xml) +doc.version # => "1.0" +doc.encoding # => "UTF-8" +``` + +### Document Type Declaration + +\Nokogiri parses an [XML document type declaration](https://www.w3.org/TR/REC-xml/#sec-prolog-dtd) +into a Nokogiri::XML::DTD object: + +``` +xml = '' +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ #(DTD: { name = "greeting" })] + }) + ``` + +### Tags + +\Nokogiri parses an [XML tag](https://www.w3.org/TR/REC-xml/#sec-starttags) +into a Nokogiri::XML::Element object. 
+ +In this example, a single tag is parsed into a document whose only child +is the root element parsed from the tag: + +``` +xml = '' +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ #(Element: { name = "root" })] + }) +``` + +A tag may have nested tags: + +``` +xml = '' +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ + #(Element: { + name = "root", + children = [ + #(Element: { + name = "foo", + children = [ + #(Element: { name = "goo" }), + #(Element: { name = "moo" })] + }), + #(Element: { + name = "bar", + children = [ + #(Element: { name = "car" }), + #(Element: { name = "far" })] + })] + })] + }) +``` + +A tag may have nested text: + +``` +xml = 'OneTwoThreeFourFive' +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ + #(Element: { + name = "root", + children = [ + #(Text "One"), + #(Element: { + name = "foo", + children = [ #(Text "Two")] + }), + #(Text "Three"), + #(Element: { + name = "bar", + children = [ #(Text "Four")] + }), + #(Text "Five")] + })] + }) +``` + +A tag may have nested markup of other types (such as comments): + +``` +xml = '' +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ + #(Element: { + name = "root", + children = [ + #(Element: { name = "foo" }), + #(Comment " Comment text. 
"), + #(Element: { name = "bar" })] + })] + }) +``` + +### Tag Attributes + +\Nokogiri parses an [XML tag attribute](https://www.w3.org/TR/REC-xml/#NT-Attribute) +into a Nokogiri::XML::Attr object: + +``` +xml = '' +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ + #(Element: { + name = "root", + attribute_nodes = [ + #(Attr: { name = "foo", value = "0" }), + #(Attr: { name = "bar", value = "1" })] + })] + }) +``` + +### Element Type Declarations + +\Nokogiri parses an [XML element type declaration](https://www.w3.org/TR/REC-xml/#elemdecls) +into a Nokogiri::XML::ElementDecl object: + +``` +xml = < + + + + +]> +DOCTYPE +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ + #(DTD: { + name = "note", + children = [ + #(ElementDecl: { "\n" }), + #(ElementDecl: { "\n" }), + #(ElementDecl: { "\n" }), + #(ElementDecl: { "\n" }), + #(ElementDecl: { "\n" })] + })] + }) + ``` + +### Attribute-List Declarations + +\Nokogiri parses an [XML attribute-list declaration](https://www.w3.org/TR/REC-xml/#attdecls) +into a Nokogiri::XML::AttributeDecl object: + +``` +xml = < + +]> +DOCTYPE +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ + #(DTD: { + name = "note", + children = [ + #(ElementDecl: { "\n" }), + #(AttributeDecl: { "\n" })] + })] + }) +``` + +### Conditional Sections + +\Nokogiri parses an [XML conditional section](https://www.w3.org/TR/REC-xml/#sec-condition-sect) +into a Nokogiri::XML::EntityDecl object: + +``` +xml = < +]> +DOCTYPE +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ + #(DTD: { + name = "note", + children = [ #(EntityDecl: { "\n" })] + })] + }) +``` + +### Character References + +\Nokogiri parses an [XML character reference](https://www.w3.org/TR/REC-xml/#sec-references) +(such as &9792;) +and replaces it with a character such as ('♀'): + +``` +xml = < + + Marie + Müller + + + +ELE 
+doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ + #(Element: { + name = "root", + children = [ + #(Text "\n "), + #(Element: { + name = "name", + children = [ + #(Text "\n "), + #(Element: { name = "vorname", children = [ #(Text "Marie")] }), + #(Text "\n "), + #(Element: { name = "nachname", children = [ #(Text "Müller")] }), + #(Text "\n "), + #(Element: { name = "geschlecht", children = [ #(Text "♀")] }), + #(Text "\n ")] + }), + #(Text "\n")] + })] + }) +``` + +### Entity References + +\Nokogiri parses an [XML entity reference](https://www.w3.org/TR/REC-xml/#sec-references) +(such as &lt;) +and replaces it with text such as ('<'): + +``` +xml = 'An entity reference is needed for the less-than character (<).' +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ + #(Element: { + name = "root", + children = [ #(Text "An entity reference is needed for the less-than character (<).")] + })] + }) +``` + +### Entity Declarations + +\Nokogiri parses an [XML entity declaration](https://www.w3.org/TR/REC-xml/#sec-entity-decl) +into a Nokogiri::XML::EntityDecl object: + +``` +xml = < +]> +DTD +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ + #(DTD: { + name = "note", + children = [ #(EntityDecl: { "\n" })] + })] + }) +``` + +## Document Fragments + +When an XML string has more than one top-level tag, +\Nokogiri *document parsing* captures only the first top-level tag +(which becomes the root element) +and ignores other top-level tags (and their children); +this may not be the desired result: + +``` +xml = < + + + + + +FRAGMENT +doc = Nokogiri::XML.parse(xml) +doc +# => +#(Document: { + name = "document", + children = [ + #(Element: { + name = "top0", + children = [ + #(Text "\n "), + #(Element: { name = "ele0" }), + #(Text "\n "), + #(Element: { name = "ele1" }), + #(Text "\n")] + })] + })``` + +To capture all top-level tags, use \Nokogiri 
*fragment parsing* +via method Nokogiri::XML::DocumentFragment.parse: + +``` +xml = < + + + + + +FRAGMENT +fragment = Nokogiri::XML::DocumentFragment.parse(xml) +fragment +# => +#(DocumentFragment: { + name = "#document-fragment", + children = [ + #(Element: { + name = "top0", + children = [ + #(Text "\n "), + #(Element: { name = "ele0" }), + #(Text "\n "), + #(Element: { name = "ele1" }), + #(Text "\n")] + }), + #(Text "\n"), + #(Element: { name = "top1" }), + #(Text "\n"), + #(Element: { name = "top2" }), + #(Text "\n")] + }) +``` + +Note that: + +- The returned object is a Nokogiri::XML::DocumentFragment object + (not a Nokogiri::XML::Document object). +- The fragment has three children of class Nokogiri::XML::Element + (which in a document is not allowed). diff --git a/ext/java/nokogiri/Html4Document.java b/ext/java/nokogiri/Html4Document.java index 6a57bb82aa..b5382a0690 100644 --- a/ext/java/nokogiri/Html4Document.java +++ b/ext/java/nokogiri/Html4Document.java @@ -52,7 +52,7 @@ public class Html4Document extends XmlDocument super(ruby, klazz, doc); } - @JRubyMethod(name = "new", meta = true, rest = true, required = 0) + @JRubyMethod(name = "new", meta = true, rest = true) public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz, IRubyObject[] args) { diff --git a/ext/java/nokogiri/Html4ElementDescription.java b/ext/java/nokogiri/Html4ElementDescription.java index 8613245914..eb6163c030 100644 --- a/ext/java/nokogiri/Html4ElementDescription.java +++ b/ext/java/nokogiri/Html4ElementDescription.java @@ -33,7 +33,7 @@ public class Html4ElementDescription extends RubyObject static { Map> _subElements = - new HashMap>(); + new HashMap<>(); subElements = Collections.synchronizedMap(_subElements); } @@ -56,7 +56,7 @@ public class Html4ElementDescription extends RubyObject List subs = subElements.get(elem.code); if (subs == null) { - subs = new ArrayList(); + subs = new ArrayList<>(); /* * A bit of a hack.
NekoHtml source code shows that @@ -127,6 +127,7 @@ public class Html4ElementDescription extends RubyObject ary[i] = ruby.newString(subs.get(i)); } + // TODO: switch to common undeprecated API when 9.4 adds 10 methods return ruby.newArray(ary); } diff --git a/ext/java/nokogiri/Html4EntityLookup.java b/ext/java/nokogiri/Html4EntityLookup.java index 5d98f1f8c9..e0184d0c54 100644 --- a/ext/java/nokogiri/Html4EntityLookup.java +++ b/ext/java/nokogiri/Html4EntityLookup.java @@ -29,7 +29,7 @@ public class Html4EntityLookup extends RubyObject /** * Looks up an HTML entity key. - * + *

* The description is a bit lacking. */ @JRubyMethod() @@ -53,11 +53,10 @@ public class Html4EntityLookup extends RubyObject IRubyObject edClass = ruby.getClassFromPath("Nokogiri::HTML4::EntityDescription"); - IRubyObject edObj = invoke(context, edClass, "new", + + return invoke(context, edClass, "new", ruby.newFixnum(val), ruby.newString(name), ruby.newString(name + " entity")); - - return edObj; } } diff --git a/ext/java/nokogiri/Html4SaxParserContext.java b/ext/java/nokogiri/Html4SaxParserContext.java index e167f11a3a..0bef93be3f 100644 --- a/ext/java/nokogiri/Html4SaxParserContext.java +++ b/ext/java/nokogiri/Html4SaxParserContext.java @@ -1,6 +1,5 @@ package nokogiri; -import java.io.ByteArrayInputStream; import java.io.InputStream; import org.apache.xerces.parsers.AbstractSAXParser; @@ -8,8 +7,6 @@ import org.jruby.Ruby; import org.jruby.RubyClass; import org.jruby.RubyEncoding; -import org.jruby.RubyFixnum; -import org.jruby.RubyString; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; import org.jruby.runtime.ThreadContext; @@ -17,7 +14,6 @@ import org.xml.sax.SAXException; import nokogiri.internals.NokogiriHandler; -import static nokogiri.internals.NokogiriHelpers.rubyStringToString; import static org.jruby.runtime.Helpers.invoke; @@ -65,7 +61,7 @@ public class Html4SaxParserContext extends XmlSaxParserContext return parser; } catch (SAXException ex) { throw new SAXException( - "Problem while creating HTML4 SAX Parser: " + ex.toString()); + "Problem while creating HTML4 SAX Parser: " + ex); } } @@ -76,9 +72,10 @@ public class Html4SaxParserContext extends XmlSaxParserContext String java_encoding = null; if (encoding != context.runtime.getNil()) { if (!(encoding instanceof RubyEncoding)) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newTypeError("encoding must be kind_of Encoding"); } - java_encoding = ((RubyEncoding)encoding).toString(); + java_encoding = encoding.toString(); } 
Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz); @@ -98,9 +95,10 @@ public class Html4SaxParserContext extends XmlSaxParserContext String java_encoding = null; if (encoding != context.runtime.getNil()) { if (!(encoding instanceof RubyEncoding)) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newTypeError("encoding must be kind_of Encoding"); } - java_encoding = ((RubyEncoding)encoding).toString(); + java_encoding = encoding.toString(); } Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass); @@ -118,15 +116,17 @@ public class Html4SaxParserContext extends XmlSaxParserContext parse_io(ThreadContext context, IRubyObject klazz, IRubyObject data, IRubyObject encoding) { if (!invoke(context, data, "respond_to?", context.runtime.newSymbol("read")).isTrue()) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newTypeError("argument expected to respond to :read"); } String java_encoding = null; if (encoding != context.runtime.getNil()) { if (!(encoding instanceof RubyEncoding)) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newTypeError("encoding must be kind_of Encoding"); } - java_encoding = ((RubyEncoding)encoding).toString(); + java_encoding = encoding.toString(); } Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz); diff --git a/ext/java/nokogiri/Html4SaxPushParser.java b/ext/java/nokogiri/Html4SaxPushParser.java index c338fd3696..3ef20427be 100644 --- a/ext/java/nokogiri/Html4SaxPushParser.java +++ b/ext/java/nokogiri/Html4SaxPushParser.java @@ -24,7 +24,6 @@ /** * Class for Nokogiri::HTML4::SAX::PushParser * - * @author * @author Piotr Szmielew - based on Nokogiri::XML::SAX::PushParser */ @JRubyClass(name = "Nokogiri::HTML4::SAX::PushParser") @@ -93,6 +92,7 @@ public class Html4SaxPushParser 
extends RubyObject setOptions(ThreadContext context, IRubyObject opts) { invoke(context, parse_options(context), "options=", opts); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods options = new ParserContext.Options(opts.convertToInteger().getLongValue()); return getOptions(context); } @@ -148,14 +148,11 @@ public class Html4SaxPushParser extends RubyObject assert saxParser != null : "saxParser null"; parserTask = new ParserTask(context, saxParser, stream); futureTask = new FutureTask((Callable) parserTask); - executor = Executors.newSingleThreadExecutor(new ThreadFactory() { - @Override - public Thread newThread(Runnable r) { - Thread t = new Thread(r); - t.setName("Html4SaxPushParser"); - t.setDaemon(true); - return t; - } + executor = Executors.newSingleThreadExecutor(r -> { + Thread t = new Thread(r); + t.setName("Html4SaxPushParser"); + t.setDaemon(true); + return t; }); executor.submit(futureTask); } @@ -168,8 +165,6 @@ public Thread newThread(Runnable r) { try { terminateImpl(); - } catch (InterruptedException e) { - throw runtime.newRuntimeError(e.toString()); } catch (Exception e) { throw runtime.newRuntimeError(e.toString()); } diff --git a/ext/java/nokogiri/NokogiriService.java b/ext/java/nokogiri/NokogiriService.java index 0f792dea62..15d3511f50 100644 --- a/ext/java/nokogiri/NokogiriService.java +++ b/ext/java/nokogiri/NokogiriService.java @@ -5,7 +5,6 @@ import java.util.Map; import org.jruby.Ruby; -import org.jruby.RubyArray; import org.jruby.RubyClass; import org.jruby.RubyFixnum; import org.jruby.RubyModule; @@ -33,13 +32,14 @@ public class NokogiriService implements BasicLibraryService public static Map getNokogiriClassCache(Ruby ruby) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods return (Map) ruby.getModule("Nokogiri").getInternalVariable("cache"); } private static Map populateNokogiriClassCache(Ruby ruby) { - Map nokogiriClassCache = new HashMap(); + Map nokogiriClassCache = new HashMap<>(); 
nokogiriClassCache.put("Nokogiri::HTML4::Document", (RubyClass)ruby.getClassFromPath("Nokogiri::HTML4::Document")); nokogiriClassCache.put("Nokogiri::HTML4::ElementDescription", (RubyClass)ruby.getClassFromPath("Nokogiri::HTML4::ElementDescription")); @@ -78,6 +78,7 @@ public class NokogiriService implements BasicLibraryService private void init(Ruby ruby) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods RubyModule nokogiri = ruby.defineModule("Nokogiri"); RubyModule xmlModule = nokogiri.defineModuleUnder("XML"); RubyModule xmlSaxModule = xmlModule.defineModuleUnder("SAX"); @@ -97,6 +98,7 @@ public class NokogiriService implements BasicLibraryService private void createSyntaxErrors(Ruby ruby, RubyModule nokogiri, RubyModule xmlModule) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods RubyClass syntaxError = nokogiri.defineClassUnder("SyntaxError", ruby.getStandardError(), ruby.getStandardError().getAllocator()); RubyClass xmlSyntaxError = xmlModule.defineClassUnder("SyntaxError", syntaxError, XML_SYNTAXERROR_ALLOCATOR); @@ -106,6 +108,7 @@ public class NokogiriService implements BasicLibraryService private RubyClass createXmlModule(Ruby ruby, RubyModule xmlModule) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods RubyClass node = xmlModule.defineClassUnder("Node", ruby.getObject(), XML_NODE_ALLOCATOR); node.defineAnnotatedMethods(XmlNode.class); @@ -183,6 +186,7 @@ public class NokogiriService implements BasicLibraryService private void createHtmlModule(Ruby ruby, RubyModule htmlModule) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods RubyClass htmlElemDesc = htmlModule.defineClassUnder("ElementDescription", ruby.getObject(), HTML_ELEMENT_DESCRIPTION_ALLOCATOR); htmlElemDesc.defineAnnotatedMethods(Html4ElementDescription.class); @@ -195,6 +199,7 @@ public class NokogiriService implements BasicLibraryService private void createDocuments(Ruby ruby, RubyModule xmlModule, 
RubyModule htmlModule, RubyClass node) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods RubyClass xmlDocument = xmlModule.defineClassUnder("Document", node, XML_DOCUMENT_ALLOCATOR); xmlDocument.defineAnnotatedMethods(XmlDocument.class); @@ -206,6 +211,7 @@ public class NokogiriService implements BasicLibraryService private void createSaxModule(Ruby ruby, RubyModule xmlSaxModule, RubyModule htmlSaxModule) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods RubyClass xmlSaxParserContext = xmlSaxModule.defineClassUnder("ParserContext", ruby.getObject(), XML_SAXPARSER_CONTEXT_ALLOCATOR); xmlSaxParserContext.defineAnnotatedMethods(XmlSaxParserContext.class); @@ -225,6 +231,7 @@ public class NokogiriService implements BasicLibraryService private void createXsltModule(Ruby ruby, RubyModule xsltModule) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods RubyClass stylesheet = xsltModule.defineClassUnder("Stylesheet", ruby.getObject(), XSLT_STYLESHEET_ALLOCATOR); stylesheet.defineAnnotatedMethods(XsltStylesheet.class); } @@ -259,21 +266,9 @@ public IRubyObject allocate(Ruby runtime, RubyClass klazz) { } }; - private static ObjectAllocator HTML_ELEMENT_DESCRIPTION_ALLOCATOR = - new ObjectAllocator() - { - public IRubyObject allocate(Ruby runtime, RubyClass klazz) { - return new Html4ElementDescription(runtime, klazz); - } - }; + private static final ObjectAllocator HTML_ELEMENT_DESCRIPTION_ALLOCATOR = Html4ElementDescription::new; - private static ObjectAllocator HTML_ENTITY_LOOKUP_ALLOCATOR = - new ObjectAllocator() - { - public IRubyObject allocate(Ruby runtime, RubyClass klazz) { - return new Html4EntityLookup(runtime, klazz); - } - }; + private static final ObjectAllocator HTML_ENTITY_LOOKUP_ALLOCATOR = Html4EntityLookup::new; public static final ObjectAllocator XML_ATTR_ALLOCATOR = new ObjectAllocator() { @@ -486,25 +481,12 @@ public IRubyObject allocate(Ruby runtime, RubyClass klazz) { } }; - private static 
ObjectAllocator XML_ATTRIBUTE_DECL_ALLOCATOR = new ObjectAllocator() - { - public IRubyObject allocate(Ruby runtime, RubyClass klazz) { - return new XmlAttributeDecl(runtime, klazz); - } - }; + private static final ObjectAllocator XML_ATTRIBUTE_DECL_ALLOCATOR = XmlAttributeDecl::new; - private static ObjectAllocator XML_ENTITY_DECL_ALLOCATOR = new ObjectAllocator() - { - public IRubyObject allocate(Ruby runtime, RubyClass klazz) { - return new XmlEntityDecl(runtime, klazz); - } - }; + private static final ObjectAllocator XML_ENTITY_DECL_ALLOCATOR = XmlEntityDecl::new; - private static ObjectAllocator XML_ELEMENT_CONTENT_ALLOCATOR = new ObjectAllocator() - { - public IRubyObject allocate(Ruby runtime, RubyClass klazz) { - throw runtime.newNotImplementedError("not implemented"); - } + private static final ObjectAllocator XML_ELEMENT_CONTENT_ALLOCATOR = (runtime, klazz) -> { + throw runtime.newNotImplementedError("not implemented"); }; public static final ObjectAllocator XML_RELAXNG_ALLOCATOR = new ObjectAllocator() @@ -537,19 +519,9 @@ public IRubyObject allocate(Ruby runtime, RubyClass klazz) { } }; - private static final ObjectAllocator XML_SAXPUSHPARSER_ALLOCATOR = new ObjectAllocator() - { - public IRubyObject allocate(Ruby runtime, RubyClass klazz) { - return new XmlSaxPushParser(runtime, klazz); - } - }; + private static final ObjectAllocator XML_SAXPUSHPARSER_ALLOCATOR = XmlSaxPushParser::new; - private static final ObjectAllocator HTML_SAXPUSHPARSER_ALLOCATOR = new ObjectAllocator() - { - public IRubyObject allocate(Ruby runtime, RubyClass klazz) { - return new Html4SaxPushParser(runtime, klazz); - } - }; + private static final ObjectAllocator HTML_SAXPUSHPARSER_ALLOCATOR = Html4SaxPushParser::new; public static final ObjectAllocator XML_SCHEMA_ALLOCATOR = new ObjectAllocator() { @@ -566,12 +538,7 @@ public IRubyObject allocate(Ruby runtime, RubyClass klazz) { } }; - public static final ObjectAllocator XML_SYNTAXERROR_ALLOCATOR = new ObjectAllocator() - { - 
public IRubyObject allocate(Ruby runtime, RubyClass klazz) { - return new XmlSyntaxError(runtime, klazz); - } - }; + public static final ObjectAllocator XML_SYNTAXERROR_ALLOCATOR = XmlSyntaxError::new; public static final ObjectAllocator XML_TEXT_ALLOCATOR = new ObjectAllocator() { @@ -588,12 +555,7 @@ public IRubyObject allocate(Ruby runtime, RubyClass klazz) { } }; - public static final ObjectAllocator XML_XPATHCONTEXT_ALLOCATOR = new ObjectAllocator() - { - public IRubyObject allocate(Ruby runtime, RubyClass klazz) { - return new XmlXpathContext(runtime, klazz); - } - }; + public static final ObjectAllocator XML_XPATHCONTEXT_ALLOCATOR = XmlXpathContext::new; public static ObjectAllocator XSLT_STYLESHEET_ALLOCATOR = new ObjectAllocator() { diff --git a/ext/java/nokogiri/XmlAttr.java b/ext/java/nokogiri/XmlAttr.java index 029f6f407f..6273086d8e 100644 --- a/ext/java/nokogiri/XmlAttr.java +++ b/ext/java/nokogiri/XmlAttr.java @@ -27,6 +27,8 @@ public class XmlAttr extends XmlNode { private static final long serialVersionUID = 1L; + // unused + @Deprecated public static final String[] HTML_BOOLEAN_ATTRS = { "checked", "compact", "declare", "defer", "disabled", "ismap", "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", @@ -45,6 +47,8 @@ public class XmlAttr extends XmlNode super(ruby, rubyClass); } + // unused + @Deprecated public XmlAttr(Ruby ruby, RubyClass rubyClass, Node attr) { @@ -56,6 +60,7 @@ public class XmlAttr extends XmlNode init(ThreadContext context, IRubyObject[] args) { if (args.length < 2) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newArgumentError(args.length, 2); } diff --git a/ext/java/nokogiri/XmlAttributeDecl.java b/ext/java/nokogiri/XmlAttributeDecl.java index 242fc804fd..0b7ffc44e6 100644 --- a/ext/java/nokogiri/XmlAttributeDecl.java +++ b/ext/java/nokogiri/XmlAttributeDecl.java @@ -32,7 +32,7 @@ public class XmlAttributeDecl extends XmlNode /** * Initialize based on an 
attributeDecl node from a NekoDTD parsed * DTD. - * + *

* Internally, XmlAttributeDecl combines these into a single node. */ public @@ -102,17 +102,19 @@ public class XmlAttributeDecl extends XmlNode { final String atype = ((Element) node).getAttribute("atype"); - if (atype != null && atype.length() != 0 && atype.charAt(0) == '(') { + if (!atype.isEmpty() && atype.charAt(0) == '(') { // removed enclosing parens String valueStr = atype.substring(1, atype.length() - 1); String[] values = valueStr.split("\\|"); RubyArray enumVals = RubyArray.newArray(context.runtime, values.length); - for (int i = 0; i < values.length; i++) { - enumVals.append(context.runtime.newString(values[i])); + for (String value : values) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods + enumVals.append(context.runtime.newString(value)); } return enumVals; } + // TODO: switch to common undeprecated API when 9.4 adds 10 methods return context.runtime.newEmptyArray(); } diff --git a/ext/java/nokogiri/XmlCdata.java b/ext/java/nokogiri/XmlCdata.java index f465a56a9e..719feecca8 100644 --- a/ext/java/nokogiri/XmlCdata.java +++ b/ext/java/nokogiri/XmlCdata.java @@ -30,6 +30,8 @@ public class XmlCdata extends XmlText super(ruby, rubyClass); } + // unused + @Deprecated public XmlCdata(Ruby ruby, RubyClass rubyClass, Node node) { @@ -41,16 +43,19 @@ public class XmlCdata extends XmlText init(ThreadContext context, IRubyObject[] args) { if (args.length < 2) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw getRuntime().newArgumentError(args.length, 2); } IRubyObject rbDocument = args[0]; content = args[1]; if (content.isNil()) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newTypeError("expected second parameter to be a String, received NilClass"); } if (!(rbDocument instanceof XmlNode)) { String msg = "expected first parameter to be a Nokogiri::XML::Document, received " + rbDocument.getMetaClass(); + // TODO: switch to common undeprecated API when 9.4 adds 10 
methods throw context.runtime.newTypeError(msg); } if (!(rbDocument instanceof XmlDocument)) { diff --git a/ext/java/nokogiri/XmlComment.java b/ext/java/nokogiri/XmlComment.java index f77a91e366..c9f53d6b02 100644 --- a/ext/java/nokogiri/XmlComment.java +++ b/ext/java/nokogiri/XmlComment.java @@ -23,6 +23,8 @@ public class XmlComment extends XmlNode { private static final long serialVersionUID = 1L; + // unused + @Deprecated public XmlComment(Ruby ruby, RubyClass rubyClass, Node node) { @@ -40,6 +42,7 @@ public class XmlComment extends XmlNode init(ThreadContext context, IRubyObject[] args) { if (args.length < 2) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw getRuntime().newArgumentError(args.length, 2); } diff --git a/ext/java/nokogiri/XmlDocument.java b/ext/java/nokogiri/XmlDocument.java index 033a0c64b8..d2ac592050 100644 --- a/ext/java/nokogiri/XmlDocument.java +++ b/ext/java/nokogiri/XmlDocument.java @@ -41,7 +41,6 @@ import nokogiri.internals.SaveContextVisitor; import nokogiri.internals.XmlDomParserContext; import nokogiri.internals.c14n.CanonicalFilter; -import nokogiri.internals.c14n.CanonicalizationException; import nokogiri.internals.c14n.Canonicalizer; /** @@ -71,9 +70,6 @@ public class XmlDocument extends XmlNode private static final ByteList DOCUMENT = ByteList.create("document"); static { DOCUMENT.setEncoding(USASCIIEncoding.INSTANCE); } - private static boolean substituteEntities = false; - private static boolean loadExternalSubset = false; // TODO: Verify this. 
- /** cache variables */ protected IRubyObject encoding; protected IRubyObject url; @@ -273,7 +269,7 @@ private static class DocumentBuilderFactoryHolder * * Create a new document with +version+ (defaults to "1.0") */ - @JRubyMethod(name = "new", meta = true, rest = true, required = 0) + @JRubyMethod(name = "new", meta = true, rest = true) public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz, IRubyObject[] args) { @@ -281,6 +277,7 @@ private static class DocumentBuilderFactoryHolder XmlDocument xmlDocument; try { Document docNode = createNewDocument(runtime); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods if ("Nokogiri::HTML4::Document".equals(((RubyClass)klazz).getName())) { xmlDocument = new Html4Document(context.runtime, (RubyClass) klazz, docNode); } else { @@ -344,7 +341,6 @@ private static class DocumentBuilderFactoryHolder public static IRubyObject load_external_subsets_set(ThreadContext context, IRubyObject cls, IRubyObject value) { - XmlDocument.loadExternalSubset = value.isTrue(); return context.nil; } @@ -395,8 +391,8 @@ private static class DocumentBuilderFactoryHolder } } IRubyObject[] nodes = xmlNode.getChildren(); - for (int i = 0; i < nodes.length; i++) { - XmlNode childNode = (XmlNode) nodes[i]; + for (IRubyObject iRubyObject : nodes) { + XmlNode childNode = (XmlNode) iRubyObject; removeNamespaceRecursively(childNode); } } @@ -471,7 +467,6 @@ private static class DocumentBuilderFactoryHolder public static IRubyObject substitute_entities_set(ThreadContext context, IRubyObject cls, IRubyObject value) { - XmlDocument.substituteEntities = value.isTrue(); return context.nil; } @@ -631,16 +626,18 @@ private static class DocumentBuilderFactoryHolder { int mode = 0; String inclusive_namespace = null; - Boolean with_comments = false; + boolean with_comments = false; if (args.length > 0 && !(args[0].isNil())) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods mode = RubyFixnum.fix2int(args[0]); 
} if (args.length > 1) { if (!args[1].isNil() && !(args[1] instanceof List)) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newTypeError("Expected array"); } if (!args[1].isNil()) { - inclusive_namespace = ((RubyArray)args[1]) + inclusive_namespace = ((RubyArray)args[1]) .join(context, context.runtime.newString(" ")) .asString() .asJavaString(); // OMG I wish I knew JRuby better, this is ugly diff --git a/ext/java/nokogiri/XmlDocumentFragment.java b/ext/java/nokogiri/XmlDocumentFragment.java index 406b3d1d1c..59a8b4fd2c 100644 --- a/ext/java/nokogiri/XmlDocumentFragment.java +++ b/ext/java/nokogiri/XmlDocumentFragment.java @@ -1,29 +1,13 @@ package nokogiri; -import static nokogiri.internals.NokogiriHelpers.getLocalNameForNamespace; import static nokogiri.internals.NokogiriHelpers.getNokogiriClass; -import static nokogiri.internals.NokogiriHelpers.getPrefix; -import static nokogiri.internals.NokogiriHelpers.isNamespace; -import static nokogiri.internals.NokogiriHelpers.rubyStringToString; - -import java.util.HashMap; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.jruby.Ruby; -import org.jruby.RubyArray; import org.jruby.RubyClass; -import org.jruby.RubyString; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; -import org.jruby.runtime.Block; -import org.jruby.runtime.Helpers; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; -import org.jruby.util.ByteList; -import org.w3c.dom.Attr; -import org.w3c.dom.NamedNodeMap; /** * Class for Nokogiri::XML::DocumentFragment @@ -36,6 +20,8 @@ public class XmlDocumentFragment extends XmlNode { private static final long serialVersionUID = 1L; + // unused + @Deprecated public XmlDocumentFragment(Ruby ruby) { diff --git a/ext/java/nokogiri/XmlDtd.java b/ext/java/nokogiri/XmlDtd.java index 7dc96c1316..3417872133 100644 --- a/ext/java/nokogiri/XmlDtd.java +++ 
b/ext/java/nokogiri/XmlDtd.java @@ -138,7 +138,7 @@ public class XmlDtd extends XmlNode * doc. The attached dtd must be the tree from * NekoDTD. The owner document of the returned tree will be * doc. - * + *

* NekoDTD parser returns a new document node containing elements * representing the dtd declarations. The plan is to get the root * element and adopt it into the correct document, stripping the @@ -332,6 +332,7 @@ public class XmlDtd extends XmlNode public IRubyObject validate(ThreadContext context, IRubyObject doc) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods RubyArray errors = RubyArray.newArray(context.getRuntime()); if (doc instanceof XmlDocument) { errors = (RubyArray)((XmlDocument)doc).getInstanceVariable("@errors"); @@ -450,7 +451,7 @@ public class XmlDtd extends XmlNode * The node is either the first child of the root dtd * node (as returned by getInternalSubset()) or the first child of * the external subset node (as returned by getExternalSubset()). - * + *

* This recursive function will not descend into an * 'externalSubset' node, thus for an internal subset it only * extracts nodes in the internal subset, and for an external @@ -460,7 +461,7 @@ public class XmlDtd extends XmlNode protected IRubyObject[] extractDecls(ThreadContext context, Node node) { - List decls = new ArrayList(); + List decls = new ArrayList<>(); while (node != null) { if (isExternalSubset(node)) { break; diff --git a/ext/java/nokogiri/XmlElement.java b/ext/java/nokogiri/XmlElement.java index b8e3225edc..5bcb76f754 100644 --- a/ext/java/nokogiri/XmlElement.java +++ b/ext/java/nokogiri/XmlElement.java @@ -4,7 +4,6 @@ import org.jruby.RubyClass; import org.jruby.anno.JRubyClass; import org.jruby.runtime.ThreadContext; -import org.jruby.runtime.builtin.IRubyObject; import org.w3c.dom.Element; import org.w3c.dom.Node; @@ -27,6 +26,8 @@ public class XmlElement extends XmlNode super(runtime, klazz); } + // unused + @Deprecated public XmlElement(Ruby runtime, RubyClass klazz, Node element) { diff --git a/ext/java/nokogiri/XmlElementContent.java b/ext/java/nokogiri/XmlElementContent.java index 501a4557be..d2921467e5 100644 --- a/ext/java/nokogiri/XmlElementContent.java +++ b/ext/java/nokogiri/XmlElementContent.java @@ -325,7 +325,7 @@ public IRubyObject value(Ruby runtime) * moves to the parent of previous sibling). The null position is * used to indicate the end of a list. 
*/ - protected static class NodeIter + public static class NodeIter { protected Node pre; protected Node cur; diff --git a/ext/java/nokogiri/XmlElementDecl.java b/ext/java/nokogiri/XmlElementDecl.java index bd4bc8cf25..a67b4a75ed 100644 --- a/ext/java/nokogiri/XmlElementDecl.java +++ b/ext/java/nokogiri/XmlElementDecl.java @@ -31,6 +31,7 @@ public class XmlElementDecl extends XmlNode XmlElementDecl(Ruby runtime, RubyClass klazz) { super(runtime, klazz); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods attrDecls = RubyArray.newArray(runtime); contentModel = runtime.getNil(); } @@ -49,6 +50,7 @@ public class XmlElementDecl extends XmlNode setNode(Ruby runtime, Node node) { super.setNode(runtime, node); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods attrDecls = RubyArray.newArray(runtime); contentModel = runtime.getNil(); } @@ -136,6 +138,7 @@ public class XmlElementDecl extends XmlNode public void appendAttrDecl(XmlAttributeDecl decl) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods attrDecls.append(decl); } diff --git a/ext/java/nokogiri/XmlEntityReference.java b/ext/java/nokogiri/XmlEntityReference.java index 77db5ddb2c..c32ae5ba59 100644 --- a/ext/java/nokogiri/XmlEntityReference.java +++ b/ext/java/nokogiri/XmlEntityReference.java @@ -1,6 +1,5 @@ package nokogiri; -import static nokogiri.internals.NokogiriHelpers.getCachedNodeOrCreate; import static nokogiri.internals.NokogiriHelpers.rubyStringToString; import nokogiri.internals.SaveContextVisitor; @@ -31,6 +30,8 @@ public class XmlEntityReference extends XmlNode super(ruby, klazz); } + // unused + @Deprecated public XmlEntityReference(Ruby ruby, RubyClass klass, Node node) { @@ -41,6 +42,7 @@ public class XmlEntityReference extends XmlNode init(ThreadContext context, IRubyObject[] args) { if (args.length < 2) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newArgumentError(args.length, 2); } diff 
--git a/ext/java/nokogiri/XmlNamespace.java b/ext/java/nokogiri/XmlNamespace.java index b2804bb7bc..030cfac529 100644 --- a/ext/java/nokogiri/XmlNamespace.java +++ b/ext/java/nokogiri/XmlNamespace.java @@ -106,7 +106,8 @@ public class XmlNamespace extends RubyObject Document document = owner.getOwnerDocument(); XmlDocument xmlDocument = (XmlDocument) getCachedNodeOrCreate(runtime, document); - assert xmlDocument.getNamespaceCache().get(prefixStr, hrefStr) == null; + XmlNamespace cachedNamespace = xmlDocument.getNamespaceCache().get(prefixStr, hrefStr); + assert cachedNamespace == null; // creating XmlNamespace instance String attrName = "xmlns"; diff --git a/ext/java/nokogiri/XmlNode.java b/ext/java/nokogiri/XmlNode.java index c74650cd6b..3386301688 100644 --- a/ext/java/nokogiri/XmlNode.java +++ b/ext/java/nokogiri/XmlNode.java @@ -1,10 +1,7 @@ package nokogiri; -import static java.lang.Math.max; import static nokogiri.internals.NokogiriHelpers.*; -import java.io.ByteArrayInputStream; -import java.io.InputStream; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.util.*; @@ -16,12 +13,10 @@ import org.jruby.RubyClass; import org.jruby.RubyFixnum; import org.jruby.RubyInteger; -import org.jruby.RubyModule; import org.jruby.RubyObject; import org.jruby.RubyString; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; -import org.jruby.exceptions.RaiseException; import org.jruby.runtime.Block; import org.jruby.runtime.Helpers; import org.jruby.runtime.ThreadContext; @@ -89,6 +84,7 @@ public class XmlNode extends RubyObject { if (!(node instanceof XmlNode)) { final Ruby runtime = context.runtime; + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw runtime.newTypeError(node == null ? runtime.getNil() : node, getNokogiriClass(runtime, "Nokogiri::XML::Node")); } return (XmlNode) node; @@ -107,7 +103,7 @@ public class XmlNode extends RubyObject /** * Coalesce to adjacent TextNodes. 
- * @param context + * @param context The current context * @param prev Previous node to cur. * @param cur Next node to prev. */ @@ -132,13 +128,14 @@ public class XmlNode extends RubyObject * are text nodes, the content will be merged into * anchorNode and the redundant nodes will be removed * from the DOM. - * + *

* To match libxml behavior (?) the final content of * anchorNode and any removed nodes will be * identical. * - * @param context - * @param anchorNode + * @param context the current context + * @param anchorNode the anchor node + * @param scheme the scheme */ protected static void coalesceTextNodes(ThreadContext context, @@ -230,18 +227,18 @@ public class XmlNode extends RubyObject * object as the only argument. If cls is * Nokogiri::XML::Node, creates a new Nokogiri::XML::Element * instead. - * + *

* This static method seems to be inherited, strangely enough. * E.g. creating a new XmlAttr from Ruby code calls this method if * XmlAttr does not define its own 'new' method. - * + *

* Since there is some Java bookkeeping that always needs to * happen, we don't define the 'initialize' method in Java because * we'd have to count on subclasses calling 'super'. - * + *

* The main consequence of this is that every subclass needs to * define its own 'new' method. - * + *

* As a convenience, this method does the following: * *

    @@ -269,10 +266,12 @@ public class XmlNode extends RubyObject Ruby ruby = context.runtime; RubyClass klazz = (RubyClass) cls; + // TODO: switch to common undeprecated API when 9.4 adds 10 methods if ("Nokogiri::XML::Node".equals(klazz.getName())) { klazz = getNokogiriClass(ruby, "Nokogiri::XML::Element"); } + // TODO: switch to common undeprecated API when 9.4 adds 10 methods XmlNode xmlNode = (XmlNode) klazz.allocate(); xmlNode.init(context, args); xmlNode.callInit(args, block); @@ -289,7 +288,7 @@ public class XmlNode extends RubyObject * interact means that subclasses cannot arbitrarily change the * require arguments by defining an 'initialize' method. This is * how the C libxml wrapper works also. - * + *

    * As written it performs initialization for a new Element with * the given name within the document * doc. So XmlElement need not override this. This @@ -301,6 +300,7 @@ public class XmlNode extends RubyObject init(ThreadContext context, IRubyObject[] args) { if (args.length < 2) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newArgumentError(args.length, 2); } @@ -366,17 +366,19 @@ public class XmlNode extends RubyObject public boolean isElement() { - if (node instanceof Element) { return true; } // in case of subclassing - else { return false; } + // in case of subclassing + return node instanceof Element; } + // unused + @Deprecated public boolean isProcessingInstruction() { return false; } /** * Return the string value of the attribute key or * nil. - * + *

    * Only applies where the underlying Node is an Element node, but * implemented here in XmlNode because not all nodes with * underlying Element nodes subclass XmlElement, such as the DTD @@ -401,7 +403,7 @@ public class XmlNode extends RubyObject if (node.getNodeType() != Node.ELEMENT_NODE) { return null; } String value = ((Element)node).getAttribute(key); - return value.length() == 0 ? null : value; + return value.isEmpty() ? null : value; } /** @@ -438,25 +440,24 @@ public class XmlNode extends RubyObject if (ns_inherit.isTrue()) { set_namespace(context, ((XmlNode)parent(context)).namespace(context)); } - return; - } - - String currentPrefix = e.getParentNode().lookupPrefix(nsURI); - String currentURI = e.getParentNode().lookupNamespaceURI(prefix); - boolean isDefault = e.getParentNode().isDefaultNamespace(nsURI); - - // add xmlns attribute if this is a new root node or if the node's - // namespace isn't a default namespace in the new document - if (e.getParentNode().getNodeType() == Node.DOCUMENT_NODE) { - // this is the root node, so we must set the namespaces attributes anyway - e.setAttribute(prefix == null ? 
"xmlns" : "xmlns:" + prefix, nsURI); - } else if (prefix == null) { - // this is a default namespace but isn't the default where this node is being added - if (!isDefault) { e.setAttribute("xmlns", nsURI); } - } else if (!prefix.equals(currentPrefix) || nsURI.equals(currentURI)) { - // this is a prefixed namespace - // but doesn't have the same prefix or the prefix is set to a different URI - e.setAttribute("xmlns:" + prefix, nsURI); + } else { + String currentPrefix = e.getParentNode().lookupPrefix(nsURI); + String currentURI = e.getParentNode().lookupNamespaceURI(prefix); + boolean isDefault = e.getParentNode().isDefaultNamespace(nsURI); + + // add xmlns attribute if this is a new root node or if the node's + // namespace isn't a default namespace in the new document + if (e.getParentNode().getNodeType() == Node.DOCUMENT_NODE) { + // this is the root node, so we must set the namespaces attributes anyway + e.setAttribute(prefix == null ? "xmlns" : "xmlns:" + prefix, nsURI); + } else if (prefix == null) { + // this is a default namespace but isn't the default where this node is being added + if (!isDefault) { e.setAttribute("xmlns", nsURI); } + } else if (!prefix.equals(currentPrefix) || !nsURI.equals(currentURI)) { + // this is a prefixed namespace + // but doesn't have the same prefix or the prefix is set to a different URI + e.setAttribute("xmlns:" + prefix, nsURI); + } } if (e.hasAttributes()) { @@ -482,7 +483,7 @@ public class XmlNode extends RubyObject nsUri = null; } - if (!(nsUri == null || "".equals(nsUri) || "http://www.w3.org/XML/1998/namespace".equals(nsUri))) { + if (!(nsUri == null || nsUri.isEmpty() || "http://www.w3.org/XML/1998/namespace".equals(nsUri))) { // Create a new namespace object and add it to the document namespace cache. // TODO: why do we need the namespace cache ? 
XmlNamespace.createFromAttr(context.runtime, attr); @@ -491,6 +492,32 @@ public class XmlNode extends RubyObject } } + // if this namespace is a duplicate of what's already in the document, remove it. + // a "duplicate" here is if the prefix and the URI both match what resolves in the parent. + if (e.getParentNode().getNodeType() == Node.ELEMENT_NODE) { + RubyArray nsdefs = this.namespace_definitions(context); + for (int j = 0 ; j < nsdefs.getLength() ; j++) { + XmlNamespace ns = (XmlNamespace)nsdefs.get(j); + + String selfPrefix = ns.getPrefix(); + String selfURI = ns.getHref(); + String parentPrefix = e.getParentNode().lookupPrefix(selfURI); + String parentURI = e.getParentNode().lookupNamespaceURI(selfPrefix); + + boolean prefixMatch = ((selfPrefix == null && parentPrefix == null) || + (selfPrefix != null && selfPrefix.equals(parentPrefix))); + boolean uriMatch = ((selfURI == null && parentURI == null) || + (selfURI != null && selfURI.equals(parentURI))); + + if (prefixMatch && uriMatch) { + String attrName = "xmlns"; + if (selfPrefix != null && !selfPrefix.isEmpty()) { attrName = attrName + ':' + selfPrefix; } + + e.removeAttribute(attrName); + } + } + } + if (this.node.hasChildNodes()) { relink_namespace(context, getChildren()); } @@ -499,9 +526,9 @@ public class XmlNode extends RubyObject static void relink_namespace(ThreadContext context, IRubyObject[] nodes) { - for (int i = 0; i < nodes.length; i++) { - if (nodes[i] instanceof XmlNode) { - ((XmlNode) nodes[i]).relink_namespace(context); + for (IRubyObject iRubyObject : nodes) { + if (iRubyObject instanceof XmlNode) { + ((XmlNode) iRubyObject).relink_namespace(context); } } } @@ -519,8 +546,7 @@ public class XmlNode extends RubyObject acceptChildren(ThreadContext context, IRubyObject[] nodes, SaveContextVisitor visitor) { if (nodes.length > 0) { - for (int i = 0; i < nodes.length; i++) { - Object item = nodes[i]; + for (Object item : nodes) { if (item instanceof XmlNode) { ((XmlNode) item).accept(context, 
visitor); } else if (item instanceof XmlNamespace) { @@ -652,7 +678,7 @@ public class XmlNode extends RubyObject Node attribute = attributes.item(j); String localName = attribute.getLocalName(); if (localName == null) { - continue; + localName = attribute.getNodeName(); } if (localName.equals(name)) { return getCachedNodeOrCreate(context.runtime, attribute); @@ -669,12 +695,14 @@ public class XmlNode extends RubyObject NamedNodeMap nodeMap = this.node.getAttributes(); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods if (nodeMap == null) { return runtime.newEmptyArray(); } RubyArray attr = runtime.newArray(nodeMap.getLength()); final XmlDocument doc = document(context.runtime); for (int i = 0; i < nodeMap.getLength(); i++) { if ((doc instanceof Html4Document) || !NokogiriHelpers.isNamespace(nodeMap.item(i))) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods attr.append(getCachedNodeOrCreate(runtime, nodeMap.item(i))); } } @@ -742,7 +770,7 @@ public class XmlNode extends RubyObject first_element_child(ThreadContext context) { List elementNodes = getElements(node, true); - if (elementNodes.size() == 0) { return context.nil; } + if (elementNodes.isEmpty()) { return context.nil; } return getCachedNodeOrCreate(context.runtime, elementNodes.get(0)); } @@ -751,7 +779,7 @@ public class XmlNode extends RubyObject last_element_child(ThreadContext context) { List elementNodes = getElements(node, false); - if (elementNodes.size() == 0) { return context.nil; } + if (elementNodes.isEmpty()) { return context.nil; } return getCachedNodeOrCreate(context.runtime, elementNodes.get(elementNodes.size() - 1)); } @@ -771,7 +799,7 @@ public class XmlNode extends RubyObject if (children.getLength() == 0) { return Collections.emptyList(); } - ArrayList elements = new ArrayList(); + ArrayList elements = new ArrayList<>(); for (int i = 0; i < children.getLength(); i++) { Node child = children.item(i); if (child.getNodeType() == Node.ELEMENT_NODE) { @@ 
-833,7 +861,6 @@ public class XmlNode extends RubyObject { RubyClass klass; XmlDomParserContext ctx; - InputStream istream; final Ruby runtime = context.runtime; @@ -864,6 +891,7 @@ public class XmlNode extends RubyObject RubyArray docErrors = getErrors(doc); if (checkNewErrors(documentErrors, docErrors)) { for (int i = 0; i < docErrors.getLength(); i++) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods documentErrors.append(docErrors.entry(i)); } document.setInstanceVariable("@errors", documentErrors); @@ -887,14 +915,15 @@ public class XmlNode extends RubyObject getErrors(XmlDocument document) { IRubyObject obj = document.getInstanceVariable("@errors"); - if (obj instanceof RubyArray) { return (RubyArray) obj; } + if (obj instanceof RubyArray) { return (RubyArray) obj; } return RubyArray.newEmptyArray(document.getRuntime()); } private static boolean checkNewErrors(RubyArray baseErrors, RubyArray newErrors) { - int length = ((RubyArray) newErrors.op_diff(baseErrors)).size(); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods + int length = ((RubyArray) newErrors.op_diff(baseErrors)).size(); return length > 0; } @@ -980,6 +1009,7 @@ public class XmlNode extends RubyObject public IRubyObject initialize_copy_with_args(ThreadContext context, IRubyObject other, IRubyObject level, IRubyObject document) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods boolean deep = level instanceof RubyInteger && RubyFixnum.fix2int(level) != 0; this.node = asXmlNode(context, other).node.cloneNode(deep); setDocument(context, (XmlDocument)document); @@ -1049,8 +1079,7 @@ public class XmlNode extends RubyObject XmlDocument xdoc = (XmlDocument) getCachedNodeOrCreate(context.getRuntime(), document); - IRubyObject xdtd = xdoc.getInternalSubset(context); - return xdtd; + return xdoc.getInternalSubset(context); } @JRubyMethod @@ -1072,9 +1101,8 @@ public class XmlNode extends RubyObject XmlDocument xdoc = (XmlDocument) 
getCachedNodeOrCreate(context.getRuntime(), document); - IRubyObject xdtd = xdoc.createInternalSubset(context, name, + return xdoc.createInternalSubset(context, name, external_id, system_id); - return xdtd; } @JRubyMethod @@ -1089,8 +1117,7 @@ public class XmlNode extends RubyObject XmlDocument xdoc = (XmlDocument) getCachedNodeOrCreate(context.getRuntime(), document); - IRubyObject xdtd = xdoc.getExternalSubset(context); - return xdtd; + return xdoc.getExternalSubset(context); } @JRubyMethod @@ -1110,8 +1137,7 @@ public class XmlNode extends RubyObject return context.getRuntime().getNil(); } XmlDocument xdoc = (XmlDocument) getCachedNodeOrCreate(context.getRuntime(), document); - IRubyObject xdtd = xdoc.createExternalSubset(context, name, external_id, system_id); - return xdtd; + return xdoc.createExternalSubset(context, name, external_id, system_id); } /** @@ -1175,28 +1201,23 @@ public class XmlNode extends RubyObject public RubyArray namespace_definitions(ThreadContext context) { - // don't use namespace_definitions cache anymore since - // namespaces might be deleted. Reflecting the result of - // namespace removals is complicated, so the cache might not be - // updated. + // don't use namespace_definitions cache anymore since namespaces might be deleted. Reflecting + // the result of namespace removals is complicated, so the cache might not be updated. 
final XmlDocument doc = document(context.runtime); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods if (doc == null) { return context.runtime.newEmptyArray(); } if (doc instanceof Html4Document) { return context.runtime.newEmptyArray(); } - List namespaces = doc.getNamespaceCache().get(node); - return RubyArray.newArray(context.runtime, namespaces); - - // // TODO: I think this implementation would be better but there are edge cases - // // See https://github.com/sparklemotion/nokogiri/issues/2543 - // RubyArray nsdefs = RubyArray.newArray(context.getRuntime()); - // NamedNodeMap attrs = node.getAttributes(); - // for (int j = 0 ; j < attrs.getLength() ; j++) { - // Attr attr = (Attr)attrs.item(j); - // if ("http://www.w3.org/2000/xmlns/" == attr.getNamespaceURI()) { - // nsdefs.append(XmlNamespace.createFromAttr(context.getRuntime(), attr)); - // } - // } - // return nsdefs; + RubyArray nsdefs = RubyArray.newArray(context.getRuntime()); + NamedNodeMap attrs = node.getAttributes(); + for (int j = 0 ; j < attrs.getLength() ; j++) { + Attr attr = (Attr)attrs.item(j); + if ("http://www.w3.org/2000/xmlns/".equals(attr.getNamespaceURI())) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods + nsdefs.append(XmlNamespace.createFromAttr(context.getRuntime(), attr)); + } + } + return nsdefs; } /** @@ -1208,6 +1229,7 @@ public class XmlNode extends RubyObject namespace_scopes(ThreadContext context) { final XmlDocument doc = document(context.runtime); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods if (doc == null) { return context.runtime.newEmptyArray(); } if (doc instanceof Html4Document) { return context.runtime.newEmptyArray(); } @@ -1219,6 +1241,8 @@ public class XmlNode extends RubyObject } else { previousNode = findPreviousElement(node); } + + // TODO: switch to common undeprecated API when 9.4 adds 10 methods if (previousNode == null) { return context.runtime.newEmptyArray(); } final RubyArray 
scoped_namespaces = context.runtime.newArray(); @@ -1228,6 +1252,7 @@ public class XmlNode extends RubyObject List namespaces = nsCache.get(previous); for (XmlNamespace namespace : namespaces) { if (prefixes_in_scope.contains(namespace.getPrefix())) { continue; } + // TODO: switch to common undeprecated API when 9.4 adds 10 methods scoped_namespaces.append(namespace); prefixes_in_scope.add(namespace.getPrefix()); } @@ -1261,7 +1286,7 @@ public class XmlNode extends RubyObject { String javaContent = rubyStringToString(content); node.setTextContent(javaContent); - if (javaContent == null || javaContent.length() == 0) { return; } + if (javaContent == null || javaContent.isEmpty()) { return; } if (node.getNodeType() == Node.TEXT_NODE || node.getNodeType() == Node.CDATA_SECTION_NODE) { return; } if (node.getFirstChild() != null) { node.getFirstChild().setUserData(NokogiriHelpers.ENCODED_STRING, true, null); @@ -1321,6 +1346,7 @@ public class XmlNode extends RubyObject IRubyObject encoding = args[1]; IRubyObject indentString = args[2]; IRubyObject options_rb = args[3]; + // TODO: switch to common undeprecated API when 9.4 adds 10 methods int options = RubyFixnum.fix2int(options_rb); String encString = rubyStringToString(encoding); @@ -1361,8 +1387,7 @@ public class XmlNode extends RubyObject isFragment() { if (node instanceof DocumentFragment) { return true; } - if (node.getParentNode() != null && node.getParentNode() instanceof DocumentFragment) { return true; } - return false; + return node.getParentNode() != null && node.getParentNode() instanceof DocumentFragment; } @JRubyMethod(name = {"next_sibling", "next"}) @@ -1588,6 +1613,7 @@ public class XmlNode extends RubyObject return context.runtime.newFixnum(0); } + // TODO: switch to common undeprecated API when 9.4 adds 10 methods return getNokogiriClass(context.runtime, "Nokogiri::XML::Node").getConstant(type); } @@ -1837,7 +1863,7 @@ protected enum AdoptScheme { parentNode.replaceChild(otherNode, thisNode); } catch 
(Exception e) { String prefix = "could not replace child: "; - throw context.runtime.newRuntimeError(prefix + e.toString()); + throw context.runtime.newRuntimeError(prefix + e); } } @@ -1894,7 +1920,7 @@ protected enum AdoptScheme { process_xincludes(ThreadContext context, IRubyObject options) { XmlDocument xmlDocument = (XmlDocument)document(context); - RubyArray errors = (RubyArray)xmlDocument.getInstanceVariable("@errors"); + RubyArray errors = (RubyArray)xmlDocument.getInstanceVariable("@errors"); while (errors.getLength() > 0) { XmlSyntaxError error = (XmlSyntaxError)errors.shift(context); if (error.toString().contains("Include operation failed")) { diff --git a/ext/java/nokogiri/XmlNodeSet.java b/ext/java/nokogiri/XmlNodeSet.java index bdb5a06e0e..080ae6eda7 100644 --- a/ext/java/nokogiri/XmlNodeSet.java +++ b/ext/java/nokogiri/XmlNodeSet.java @@ -2,7 +2,6 @@ import static nokogiri.XmlNode.setDocumentAndDecorate; import static nokogiri.internals.NokogiriHelpers.getNokogiriClass; -import static nokogiri.internals.NokogiriHelpers.nodeListToRubyArray; import java.util.Arrays; @@ -151,11 +150,9 @@ public class XmlNodeSet extends RubyObject implements NodeList int last = 0; outer: - for (int i = 0; i < curr.length; i++) { - IRubyObject n = curr[i]; - - for (int j = 0; j < other.length; j++) { - if (other[j] == n) { + for (IRubyObject n : curr) { + for (IRubyObject iRubyObject : other) { + if (iRubyObject == n) { result[last++] = n; continue outer; } @@ -182,9 +179,7 @@ public class XmlNodeSet extends RubyObject implements NodeList int last = 0; - for (int i = 0; i < orig.length; i++) { - IRubyObject n = orig[i]; - + for (IRubyObject n : orig) { if (n == nodeOrNamespace) { continue; } @@ -223,8 +218,8 @@ public class XmlNodeSet extends RubyObject implements NodeList public IRubyObject include_p(ThreadContext context, IRubyObject node_or_namespace) { - for (int i = 0; i < nodes.length; i++) { - if (nodes[i] == node_or_namespace) { + for (IRubyObject node : nodes) { 
+ if (node == node_or_namespace) { return context.tru; } } @@ -259,11 +254,9 @@ public class XmlNodeSet extends RubyObject implements NodeList int last = 0; outer: - for (int i = 0; i < curr.length; i++) { - IRubyObject n = curr[i]; - - for (int j = 0; j < other.length; j++) { - if (other[j] == n) { + for (IRubyObject n : curr) { + for (IRubyObject iRubyObject : other) { + if (iRubyObject == n) { continue outer; } } @@ -283,6 +276,8 @@ public class XmlNodeSet extends RubyObject implements NodeList IRubyObject[] otherNodes = getNodes(context, nodeSet); if (nodes.length == 0) { + // TODO: switch to interface method when it has been in 9.4 for a year. + // The "useless" cast here on JRuby 10 is necessary on 9.4 for now. return ((XmlNodeSet) nodeSet).dup(context); } @@ -296,11 +291,9 @@ public class XmlNodeSet extends RubyObject implements NodeList int last = curr.length; outer: - for (int i = 0; i < other.length; i++) { - IRubyObject n = other[i]; - - for (int j = 0; j < curr.length; j++) { - if (curr[j] == n) { + for (IRubyObject n : other) { + for (IRubyObject iRubyObject : curr) { + if (iRubyObject == n) { continue outer; } } @@ -329,6 +322,7 @@ public class XmlNodeSet extends RubyObject implements NodeList rangeBeginLength(ThreadContext context, IRubyObject rangeMaybe, int len, int[] begLen) { RubyRange range = (RubyRange) rangeMaybe; + // TODO: switch to common undeprecated API when 9.4 adds 10 methods int min = range.begin(context).convertToInteger().getIntValue(); int max = range.end(context).convertToInteger().getIntValue(); @@ -358,6 +352,7 @@ public class XmlNodeSet extends RubyObject implements NodeList slice(ThreadContext context, IRubyObject indexOrRange) { if (indexOrRange instanceof RubyFixnum) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods return slice(context, ((RubyFixnum) indexOrRange).getIntValue()); } if (indexOrRange instanceof RubyRange) { @@ -367,6 +362,7 @@ public class XmlNodeSet extends RubyObject implements NodeList 
int max = begLen[1]; return subseq(context, min, max - min); } + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newTypeError("index must be an Integer or a Range"); } @@ -388,6 +384,7 @@ public class XmlNodeSet extends RubyObject implements NodeList public IRubyObject slice(ThreadContext context, IRubyObject start, IRubyObject length) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods int s = ((RubyFixnum) start).getIntValue(); int l = ((RubyFixnum) length).getIntValue(); @@ -422,6 +419,7 @@ public class XmlNodeSet extends RubyObject implements NodeList public RubyArray to_a(ThreadContext context) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods return context.runtime.newArrayNoCopy(nodes); } diff --git a/ext/java/nokogiri/XmlProcessingInstruction.java b/ext/java/nokogiri/XmlProcessingInstruction.java index fc41098b39..ab0a80ed02 100644 --- a/ext/java/nokogiri/XmlProcessingInstruction.java +++ b/ext/java/nokogiri/XmlProcessingInstruction.java @@ -65,6 +65,8 @@ public class XmlProcessingInstruction extends XmlNode return self; } + // unused + @Deprecated @Override public boolean isProcessingInstruction() { return true; } diff --git a/ext/java/nokogiri/XmlReader.java b/ext/java/nokogiri/XmlReader.java index 74ef72d631..1531b7ffdd 100644 --- a/ext/java/nokogiri/XmlReader.java +++ b/ext/java/nokogiri/XmlReader.java @@ -93,7 +93,7 @@ public class XmlReader extends RubyObject public void init(Ruby runtime) { - nodeQueue = new LinkedList(); + nodeQueue = new LinkedList<>(); nodeQueue.add(new ReaderNode.EmptyNode(runtime)); } @@ -181,8 +181,8 @@ public class XmlReader extends RubyObject ensureNodeClosed(context); if (readerNode == null) { return context.getRuntime().getNil(); } - if (!(readerNode instanceof ElementNode)) { context.getRuntime().getFalse(); } - return RubyBoolean.newBoolean(context.getRuntime(), !readerNode.hasChildren); + if (!(readerNode instanceof ElementNode)) { return 
context.getRuntime().getFalse(); } + return RubyBoolean.newBoolean(context, !readerNode.hasChildren); } @JRubyMethod @@ -210,6 +210,7 @@ public class XmlReader extends RubyObject "Nokogiri::XML::Reader")); reader.init(runtime); reader.setInstanceVariable("@source", args[0]); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods reader.setInstanceVariable("@errors", runtime.newArray()); IRubyObject url = context.nil; if (args.length > 1) { url = args[1]; } @@ -241,6 +242,7 @@ public class XmlReader extends RubyObject "Nokogiri::XML::Reader")); reader.init(runtime); reader.setInstanceVariable("@source", args[0]); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods reader.setInstanceVariable("@errors", runtime.newArray()); IRubyObject url = context.nil; if (args.length > 1) { url = args[1]; } @@ -280,7 +282,7 @@ public class XmlReader extends RubyObject { if (current.depth < 0) { return null; } if (!current.hasChildren) { return null; } - StringBuffer sb = new StringBuffer(); + StringBuilder sb = new StringBuilder(); for (int i = current.startOffset + 1; i <= current.endOffset - 1; i++) { sb.append(nodeQueue.get(i).getString()); } @@ -396,8 +398,9 @@ public class XmlReader extends RubyObject final ReaderNode currentNode = currentNode(); if (currentNode == null) { return runtime.getNil(); } if (currentNode.isError()) { - RubyArray errors = (RubyArray) getInstanceVariable("@errors"); + RubyArray errors = (RubyArray) getInstanceVariable("@errors"); IRubyObject error = currentNode.toSyntaxError(); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods errors.append(error); setInstanceVariable("@errors", errors); @@ -492,9 +495,9 @@ private class DocumentHandler extends DefaultXMLDocumentHandler implements XMLEr startDocument(XMLLocator locator, String encoding, NamespaceContext context, Augmentations augs) { depth = 0; - langStack = new Stack(); - xmlBaseStack = new Stack(); - elementStack = new Stack(); + langStack = new 
Stack<>(); + xmlBaseStack = new Stack<>(); + elementStack = new Stack<>(); } @Override @@ -552,7 +555,7 @@ private class DocumentHandler extends DefaultXMLDocumentHandler implements XMLEr String qName = element.rawname; String uri = element.uri; String localName = element.localpart; - ReaderNode readerNode = ReaderNode.createElementNode(ruby, uri, localName, qName, attrs, depth, langStack, + ElementNode readerNode = ReaderNode.createElementNode(ruby, uri, localName, qName, attrs, depth, langStack, xmlBaseStack); if (!elementStack.isEmpty()) { ElementNode parent = elementStack.peek(); @@ -564,7 +567,7 @@ private class DocumentHandler extends DefaultXMLDocumentHandler implements XMLEr depth++; if (readerNode.lang != null) { langStack.push(readerNode.lang); } if (readerNode.xmlBase != null) { xmlBaseStack.push(readerNode.xmlBase); } - elementStack.push((ReaderNode.ElementNode)readerNode); + elementStack.push(readerNode); } else { readerNode.endOffset = readerNode.startOffset; readerNode.hasChildren = false; diff --git a/ext/java/nokogiri/XmlRelaxng.java b/ext/java/nokogiri/XmlRelaxng.java index eee9113a65..ed8f5a0a18 100644 --- a/ext/java/nokogiri/XmlRelaxng.java +++ b/ext/java/nokogiri/XmlRelaxng.java @@ -1,15 +1,12 @@ package nokogiri; -import static nokogiri.internals.NokogiriHelpers.getNokogiriClass; - import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.StringWriter; -import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; import javax.xml.transform.Source; -import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; @@ -63,6 +60,7 @@ public class XmlRelaxng extends XmlSchema parseOptions = defaultParseOptions(context.getRuntime()); } + // TODO: switch to common undeprecated API when 9.4 adds 10 methods 
xmlRelaxng.setInstanceVariable("@errors", runtime.newEmptyArray()); xmlRelaxng.setInstanceVariable("@parse_options", parseOptions); @@ -89,27 +87,16 @@ public class XmlRelaxng extends XmlSchema StreamResult result = new StreamResult(xmlAsWriter); try { TransformerFactory.newInstance().newTransformer().transform(ds, result); - } catch (TransformerConfigurationException ex) { - throw context.getRuntime() - .newRuntimeError("Could not parse document: " + ex.getMessage()); } catch (TransformerException ex) { throw context.getRuntime() .newRuntimeError("Could not parse document: " + ex.getMessage()); } - try { - is = new ByteArrayInputStream(xmlAsWriter.toString().getBytes("UTF-8")); - } catch (UnsupportedEncodingException ex) { - throw context.getRuntime() - .newRuntimeError("Could not parse document: " + ex.getMessage()); - } + is = new ByteArrayInputStream(xmlAsWriter.toString().getBytes(StandardCharsets.UTF_8)); } try { return factory.compileSchema(is); - } catch (VerifierConfigurationException ex) { - throw context.getRuntime() - .newRuntimeError("Could not parse document: " + ex.getMessage()); - } catch (SAXException ex) { + } catch (VerifierConfigurationException | SAXException ex) { throw context.getRuntime() .newRuntimeError("Could not parse document: " + ex.getMessage()); } catch (IOException ex) { diff --git a/ext/java/nokogiri/XmlSaxParserContext.java b/ext/java/nokogiri/XmlSaxParserContext.java index 4c20349ea3..00568b0f75 100644 --- a/ext/java/nokogiri/XmlSaxParserContext.java +++ b/ext/java/nokogiri/XmlSaxParserContext.java @@ -1,14 +1,12 @@ package nokogiri; import nokogiri.internals.*; -import static nokogiri.internals.NokogiriHelpers.rubyStringToString; import org.apache.xerces.parsers.AbstractSAXParser; import org.jruby.Ruby; import org.jruby.RubyClass; import org.jruby.RubyEncoding; import org.jruby.RubyFixnum; -import org.jruby.RubyString; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; import 
org.jruby.exceptions.RaiseException; @@ -18,7 +16,6 @@ import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; @@ -104,7 +101,7 @@ public class XmlSaxParserContext extends ParserContext if (!(encoding instanceof RubyEncoding)) { throw context.runtime.newTypeError("encoding must be kind_of Encoding"); } - java_encoding = ((RubyEncoding)encoding).toString(); + java_encoding = encoding.toString(); } XmlSaxParserContext ctx = newInstance(context.runtime, (RubyClass) klazz); @@ -129,9 +126,10 @@ public class XmlSaxParserContext extends ParserContext String java_encoding = null; if (encoding != context.runtime.getNil()) { if (!(encoding instanceof RubyEncoding)) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newTypeError("encoding must be kind_of Encoding"); } - java_encoding = ((RubyEncoding)encoding).toString(); + java_encoding = encoding.toString(); } XmlSaxParserContext ctx = newInstance(context.runtime, (RubyClass) klazz); @@ -156,15 +154,17 @@ public class XmlSaxParserContext extends ParserContext parse_io(ThreadContext context, IRubyObject klazz, IRubyObject data, IRubyObject encoding) { if (!invoke(context, data, "respond_to?", context.runtime.newSymbol("read")).isTrue()) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newTypeError("argument expected to respond to :read"); } String java_encoding = null; if (encoding != context.runtime.getNil()) { if (!(encoding instanceof RubyEncoding)) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newTypeError("encoding must be kind_of Encoding"); } - java_encoding = ((RubyEncoding)encoding).toString(); + java_encoding = encoding.toString(); } XmlSaxParserContext ctx = newInstance(context.runtime, (RubyClass) klazz); @@ -236,9 +236,10 @@ public class XmlSaxParserContext extends 
ParserContext protected static Options defaultParseOptions(ThreadContext context) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods return new ParserContext.Options( RubyFixnum.fix2long(Helpers.invoke(context, - ((RubyClass)context.getRuntime().getClassFromPath("Nokogiri::XML::ParseOptions")) + context.getRuntime().getClassFromPath("Nokogiri::XML::ParseOptions") .getConstant("DEFAULT_XML"), "to_i")) ); @@ -272,7 +273,7 @@ public class XmlSaxParserContext extends ParserContext parser.setProperty("http://xml.org/sax/properties/lexical-handler", handler); parser.setProperty("http://xml.org/sax/properties/declaration-handler", handler); } catch (Exception ex) { - throw runtime.newRuntimeError("Problem while creating XML SAX Parser: " + ex.toString()); + throw runtime.newRuntimeError("Problem while creating XML SAX Parser: " + ex); } try { diff --git a/ext/java/nokogiri/XmlSaxPushParser.java b/ext/java/nokogiri/XmlSaxPushParser.java index 26261a33e3..d6c69eb1ca 100644 --- a/ext/java/nokogiri/XmlSaxPushParser.java +++ b/ext/java/nokogiri/XmlSaxPushParser.java @@ -89,6 +89,7 @@ public class XmlSaxPushParser extends RubyObject setOptions(ThreadContext context, IRubyObject opts) { invoke(context, parse_options(context), "options=", opts); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods options = new ParserContext.Options(opts.convertToInteger().getLongValue()); return getOptions(context); } @@ -169,15 +170,12 @@ public class XmlSaxPushParser extends RubyObject assert saxParser != null : "saxParser null"; parserTask = new ParserTask(context, saxParser, stream); - futureTask = new FutureTask(parserTask); - executor = Executors.newSingleThreadExecutor(new ThreadFactory() { - @Override - public Thread newThread(Runnable r) { - Thread t = new Thread(r); - t.setName("XmlSaxPushParser"); - t.setDaemon(true); - return t; - } + futureTask = new FutureTask<>(parserTask); + executor = Executors.newSingleThreadExecutor(r -> { + Thread t = new 
Thread(r); + t.setName("XmlSaxPushParser"); + t.setDaemon(true); + return t; }); executor.submit(futureTask); } @@ -190,8 +188,6 @@ public Thread newThread(Runnable r) { try { terminateImpl(); - } catch (InterruptedException e) { - throw runtime.newRuntimeError(e.toString()); } catch (Exception e) { throw runtime.newRuntimeError(e.toString()); } diff --git a/ext/java/nokogiri/XmlSchema.java b/ext/java/nokogiri/XmlSchema.java index 381953c64d..d1432471e2 100644 --- a/ext/java/nokogiri/XmlSchema.java +++ b/ext/java/nokogiri/XmlSchema.java @@ -6,21 +6,17 @@ import java.io.IOException; import java.io.InputStream; import java.io.Reader; -import java.io.StringReader; import javax.xml.XMLConstants; import javax.xml.transform.Source; import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamSource; import javax.xml.validation.Schema; import javax.xml.validation.SchemaFactory; import javax.xml.validation.Validator; -import nokogiri.internals.IgnoreSchemaErrorsErrorHandler; import nokogiri.internals.SchemaErrorHandler; import nokogiri.internals.XmlDomParserContext; import nokogiri.internals.ParserContext; -import nokogiri.internals.ParserContext.Options; import org.jruby.Ruby; import org.jruby.RubyArray; @@ -29,7 +25,6 @@ import org.jruby.RubyObject; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; -import org.jruby.exceptions.RaiseException; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.Visibility; import org.jruby.runtime.builtin.IRubyObject; @@ -106,14 +101,16 @@ public class XmlSchema extends RubyObject if (parseOptions == null) { parseOptions = defaultParseOptions(context.getRuntime()); } + // TODO: switch to common undeprecated API when 9.4 adds 10 methods long intParseOptions = RubyFixnum.fix2long(Helpers.invoke(context, parseOptions, "to_i")); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods xmlSchema.setInstanceVariable("@errors", runtime.newEmptyArray()); 
xmlSchema.setInstanceVariable("@parse_options", parseOptions); try { SchemaErrorHandler errorHandler = - new SchemaErrorHandler(context.getRuntime(), (RubyArray)xmlSchema.getInstanceVariable("@errors")); + new SchemaErrorHandler(context.getRuntime(), (RubyArray)xmlSchema.getInstanceVariable("@errors")); Schema schema = xmlSchema.getSchema(source, context.getRuntime().getCurrentDirectory(), @@ -130,7 +127,7 @@ public class XmlSchema extends RubyObject protected static IRubyObject defaultParseOptions(Ruby runtime) { - return ((RubyClass)runtime.getClassFromPath("Nokogiri::XML::ParseOptions")).getConstant("DEFAULT_SCHEMA"); + return runtime.getClassFromPath("Nokogiri::XML::ParseOptions").getConstant("DEFAULT_SCHEMA"); } /* @@ -151,6 +148,7 @@ public class XmlSchema extends RubyObject if (!(rbDocument instanceof XmlNode)) { String msg = "expected parameter to be a Nokogiri::XML::Document, received " + rbDocument.getMetaClass(); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newTypeError(msg); } if (!(rbDocument instanceof XmlDocument)) { @@ -159,8 +157,9 @@ public class XmlSchema extends RubyObject XmlDocument doc = ((XmlDocument)((XmlNode) rbDocument).document(context)); - RubyArray errors = (RubyArray) doc.getInstanceVariable("@errors"); + RubyArray errors = (RubyArray) doc.getInstanceVariable("@errors"); if (!errors.isEmpty()) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw((XmlSyntaxError) errors.first()).toThrowable(); } @@ -178,6 +177,7 @@ public class XmlSchema extends RubyObject private static IRubyObject getSchema(ThreadContext context, RubyClass klazz, Source source, IRubyObject parseOptions) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods String moduleName = klazz.getName(); if ("Nokogiri::XML::Schema".equals(moduleName)) { return XmlSchema.createSchemaInstance(context, klazz, source, parseOptions); @@ -206,9 +206,11 @@ public class XmlSchema extends RubyObject 
XmlDocument xmlDocument = ctx.parse(context, getNokogiriClass(runtime, "Nokogiri::XML::Document"), context.nil); return validate_document_or_file(context, xmlDocument); } catch (Exception ex) { - RubyArray errors = (RubyArray)context.runtime.newEmptyArray(); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods + RubyArray errors = context.runtime.newEmptyArray(); XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime); xmlSyntaxError.setException(ex); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods errors.append(xmlSyntaxError); return errors; } @@ -217,7 +219,8 @@ public class XmlSchema extends RubyObject IRubyObject validate_document_or_file(ThreadContext context, XmlDocument xmlDocument) { - RubyArray errors = context.runtime.newEmptyArray(); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods + RubyArray errors = context.runtime.newEmptyArray(); ErrorHandler errorHandler = new SchemaErrorHandler(context.runtime, errors); setErrorHandler(errorHandler); @@ -226,6 +229,7 @@ public class XmlSchema extends RubyObject } catch (SAXException ex) { XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime); xmlSyntaxError.setException(ex); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods errors.append(xmlSyntaxError); } catch (IOException ex) { throw context.runtime.newIOError(ex.getMessage()); @@ -291,7 +295,7 @@ private class SchemaResourceResolver implements LSResourceResolver } try { this.errorHandler.warning(new SAXParseException(String.format("Attempt to load network entity '%s'", systemId), null)); - } catch (SAXException ex) { + } catch (SAXException ignored) { } } else { String adjusted = adjustSystemIdIfNecessary(currentDir, scriptFileName, baseURI, systemId); @@ -303,7 +307,7 @@ private class SchemaResourceResolver implements LSResourceResolver } } - private class SchemaLSInput implements LSInput + private static class 
SchemaLSInput implements LSInput { protected String fPublicId; protected String fSystemId; diff --git a/ext/java/nokogiri/XmlSyntaxError.java b/ext/java/nokogiri/XmlSyntaxError.java index 0fe1b649e4..d3265499b4 100644 --- a/ext/java/nokogiri/XmlSyntaxError.java +++ b/ext/java/nokogiri/XmlSyntaxError.java @@ -91,6 +91,8 @@ public class XmlSyntaxError extends RubyException return xmlSyntaxError; } + // unused + @Deprecated public static XmlSyntaxError createFatalError(Ruby runtime, SAXParseException e) { diff --git a/ext/java/nokogiri/XmlText.java b/ext/java/nokogiri/XmlText.java index f51bc6b91c..12f984c67c 100644 --- a/ext/java/nokogiri/XmlText.java +++ b/ext/java/nokogiri/XmlText.java @@ -47,6 +47,7 @@ public class XmlText extends XmlNode init(ThreadContext context, IRubyObject[] args) { if (args.length < 2) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newArgumentError(args.length, 2); } @@ -55,6 +56,7 @@ public class XmlText extends XmlNode if (!(rbDocument instanceof XmlNode)) { String msg = "expected second parameter to be a Nokogiri::XML::Document, received " + rbDocument.getMetaClass(); + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.runtime.newTypeError(msg); } if (!(rbDocument instanceof XmlDocument)) { diff --git a/ext/java/nokogiri/XmlXpathContext.java b/ext/java/nokogiri/XmlXpathContext.java index 5996956f68..83044d4919 100644 --- a/ext/java/nokogiri/XmlXpathContext.java +++ b/ext/java/nokogiri/XmlXpathContext.java @@ -17,7 +17,6 @@ import org.jruby.RubyObject; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; -import org.jruby.exceptions.RaiseException; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; import org.jruby.util.SafePropertyAccessor; @@ -121,7 +120,6 @@ public class XmlXpathContext extends RubyObject while (xpathFunctionCalls.find()) { namespacedQuery.append(query.subSequence(jchar, 
xpathFunctionCalls.start())); - jchar = xpathFunctionCalls.start(); if (methodNames.contains(xpathFunctionCalls.group())) { namespacedQuery.append(NokogiriNamespaceContext.NOKOGIRI_PREFIX); @@ -198,7 +196,7 @@ public class XmlXpathContext extends RubyObject return tryGetNodeSet(context, expr, fnResolver); } catch (TransformerException | RuntimeException ex) { throw XmlSyntaxError.createXMLXPathSyntaxError(context.runtime, - (expr + ": " + ex.toString()), + (expr + ": " + ex), ex).toThrowable(); } } diff --git a/ext/java/nokogiri/XsltStylesheet.java b/ext/java/nokogiri/XsltStylesheet.java index 1870b60d35..c71b4750c2 100644 --- a/ext/java/nokogiri/XsltStylesheet.java +++ b/ext/java/nokogiri/XsltStylesheet.java @@ -83,8 +83,9 @@ public class XsltStylesheet extends RubyObject if (parameters instanceof RubyHash) { setHashParameters(transf, (RubyHash)parameters); } else if (parameters instanceof RubyArray) { - setArrayParameters(transf, context, (RubyArray)parameters); + setArrayParameters(transf, context, (RubyArray)parameters); } else { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw context.getRuntime().newTypeError("parameters should be given either Array or Hash"); } } @@ -170,8 +171,9 @@ public class XsltStylesheet extends RubyObject ensureDocumentHasNoError(ThreadContext context, XmlDocument xmlDoc) { Ruby runtime = context.getRuntime(); - RubyArray errors_of_xmlDoc = (RubyArray) xmlDoc.getInstanceVariable("@errors"); + RubyArray errors_of_xmlDoc = (RubyArray) xmlDoc.getInstanceVariable("@errors"); if (!errors_of_xmlDoc.isEmpty()) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods throw runtime.newRuntimeError(errors_of_xmlDoc.first().asString().asJavaString()); } } @@ -212,11 +214,7 @@ public class XsltStylesheet extends RubyObject if (result.getNode().getFirstChild() == null) { stringResult = retryXsltTransformation(context, args, domSource, elistener); // StreamResult } - } catch 
(TransformerConfigurationException ex) { - throw runtime.newRuntimeError(ex.getMessage()); - } catch (TransformerException ex) { - throw runtime.newRuntimeError(ex.getMessage()); - } catch (IOException ex) { + } catch (TransformerException | IOException ex) { throw runtime.newRuntimeError(ex.getMessage()); } @@ -230,7 +228,7 @@ public class XsltStylesheet extends RubyObject } if (stringResult == null) { - return createDocumentFromDomResult(context, runtime, result); + return createDocumentFromDomResult(context, result); } else { return createDocumentFromString(context, runtime, stringResult); } @@ -291,7 +289,7 @@ public class XsltStylesheet extends RubyObject } private IRubyObject - createDocumentFromDomResult(ThreadContext context, Ruby runtime, DOMResult domResult) + createDocumentFromDomResult(ThreadContext context, DOMResult domResult) { if ("html".equals(domResult.getNode().getFirstChild().getNodeName())) { return new Html4Document(context.runtime, (Document) domResult.getNode()); @@ -329,15 +327,17 @@ public class XsltStylesheet extends RubyObject args[2] = runtime.getNil(); // encoding RubyClass parse_options = (RubyClass)runtime.getClassFromPath("Nokogiri::XML::ParseOptions"); if (htmlish) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods args[3] = parse_options.getConstant("DEFAULT_HTML"); RubyClass htmlDocumentClass = getNokogiriClass(runtime, "Nokogiri::HTML4::Document"); return Helpers.invoke(context, htmlDocumentClass, "parse", args); } else { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods args[3] = parse_options.getConstant("DEFAULT_XML"); RubyClass xmlDocumentClass = getNokogiriClass(runtime, "Nokogiri::XML::Document"); XmlDocument xmlDocument = (XmlDocument) Helpers.invoke(context, xmlDocumentClass, "parse", args); if (((Document)xmlDocument.getNode()).getDocumentElement() == null) { - RubyArray errors = (RubyArray) xmlDocument.getInstanceVariable("@errors"); + RubyArray errors = (RubyArray) 
xmlDocument.getInstanceVariable("@errors"); Helpers.invoke(context, errors, "<<", args[0]); } return xmlDocument; diff --git a/ext/java/nokogiri/internals/NokogiriHelpers.java b/ext/java/nokogiri/internals/NokogiriHelpers.java index 97ffc50034..8dd28ee6d1 100644 --- a/ext/java/nokogiri/internals/NokogiriHelpers.java +++ b/ext/java/nokogiri/internals/NokogiriHelpers.java @@ -40,6 +40,8 @@ import nokogiri.XmlText; import nokogiri.XmlXpathContext; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * A class for various utility methods. * @@ -59,6 +61,8 @@ public class NokogiriHelpers return (XmlNode) node.getUserData(CACHED_NODE); } + // unused + @Deprecated public static void clearCachedNode(Node node) { @@ -221,7 +225,7 @@ public class NokogiriHelpers public static IRubyObject nonEmptyStringOrNil(Ruby runtime, String s) { - if (s == null || s.length() == 0) { return runtime.getNil(); } + if (s == null || s.isEmpty()) { return runtime.getNil(); } return RubyString.newString(runtime, s); } @@ -287,7 +291,7 @@ public class NokogiriHelpers Node cur, tmp, next; - String buffer = ""; + StringBuilder buffer = new StringBuilder(); cur = node; @@ -295,10 +299,10 @@ public class NokogiriHelpers String name = ""; String sep = "?"; int occur = 0; - boolean generic = false; + boolean generic; if (cur.getNodeType() == Node.DOCUMENT_NODE) { - if (buffer.startsWith("/")) { break; } + if (buffer.toString().startsWith("/")) { break; } sep = "/"; next = null; @@ -471,18 +475,20 @@ public class NokogiriHelpers } if (occur == 0) { - buffer = sep + name + buffer; + buffer.insert(0, sep + name); } else { - buffer = sep + name + "[" + occur + "]" + buffer; + buffer.insert(0, sep + name + "[" + occur + "]"); } cur = next; } while (cur != null); - return buffer; + return buffer.toString(); } + // unused + @Deprecated static boolean compareTwoNodes(Node m, Node n) { @@ -494,7 +500,7 @@ public class NokogiriHelpers nodesAreEqual(Object a, Object b) { return (((a == null) && (b == 
null)) || - ((a != null) && (b != null) && (b.equals(a)))); + ((b != null) && (b.equals(a)))); } private static boolean @@ -505,7 +511,7 @@ public class NokogiriHelpers private static final Pattern encoded_pattern = Pattern.compile("&|>|<| "); private static final String[] encoded = {"&", ">", "<", " "}; - private static final Pattern decoded_pattern = Pattern.compile("&|>|<|\r"); + private static final Pattern decoded_pattern = Pattern.compile("[&><\r]"); private static final String[] decoded = {"&", ">", "<", "\r"}; private static StringBuffer @@ -555,6 +561,8 @@ public class NokogiriHelpers return (nodeName.startsWith("xmlns")); } + // unused + @Deprecated public static boolean isNonDefaultNamespace(Node node) { @@ -591,6 +599,8 @@ public class NokogiriHelpers return str.isEmpty() || isBlank((CharSequence) str); } + // unused + @Deprecated public static boolean isNullOrEmpty(String str) { @@ -649,8 +659,9 @@ public class NokogiriHelpers nodeArrayToRubyArray(Ruby ruby, Node[] nodes) { RubyArray n = RubyArray.newArray(ruby, nodes.length); - for (int i = 0; i < nodes.length; i++) { - n.append(NokogiriHelpers.getCachedNodeOrCreate(ruby, nodes[i])); + for (Node node : nodes) { + // TODO: switch to common undeprecated API when 9.4 adds 10 methods + n.append(NokogiriHelpers.getCachedNodeOrCreate(ruby, node)); } return n; } @@ -690,7 +701,7 @@ public class NokogiriHelpers private static String resolveSystemId(String baseName, String systemId) { - if (baseName == null || baseName.length() < 1) { return null; } + if (baseName == null || baseName.isEmpty()) { return null; } String parentName; baseName = baseName.replace("%20", " "); File base = new File(baseName); @@ -703,15 +714,13 @@ public class NokogiriHelpers return null; } - private static final Charset UTF8 = Charset.forName("UTF-8"); - public static boolean isUTF8(String encoding) { if (encoding == null) { return true; } // no need to convert encoding if ("UTF-8".equals(encoding)) { return true; } - return 
UTF8.aliases().contains(encoding); + return UTF_8.aliases().contains(encoding); } public static ByteBuffer @@ -720,6 +729,8 @@ public class NokogiriHelpers return output_charset.encode(CharBuffer.wrap(input_string)); // does replace implicitly on un-mappable characters } + // unused + @Deprecated public static CharSequence convertEncodingByNKFIfNecessary(ThreadContext context, XmlDocument doc, CharSequence str) { @@ -766,15 +777,8 @@ public class NokogiriHelpers RubyString r_str = (RubyString)nkf_method.invoke(null, context, null, runtime.newString(opt), runtime.newString(str.toString())); return NokogiriHelpers.rubyStringToString(r_str); - } catch (SecurityException e) { - return str; - } catch (NoSuchMethodException e) { - return str; - } catch (IllegalArgumentException e) { - return str; - } catch (IllegalAccessException e) { - return str; - } catch (InvocationTargetException e) { + } catch (SecurityException | NoSuchMethodException | IllegalArgumentException | IllegalAccessException | + InvocationTargetException e) { return str; } } diff --git a/ext/java/nokogiri/internals/SaveContextVisitor.java b/ext/java/nokogiri/internals/SaveContextVisitor.java index 462a9bd69b..aac04a8430 100644 --- a/ext/java/nokogiri/internals/SaveContextVisitor.java +++ b/ext/java/nokogiri/internals/SaveContextVisitor.java @@ -58,6 +58,7 @@ public class SaveContextVisitor private final Deque c14nNamespaceStack; private final Deque c14nAttrStack; //private List c14nExclusiveInclusivePrefixes = null; + private final Stack> xmlnsNamespaceStack; /* * U can't touch this. @@ -117,6 +118,7 @@ public class SaveContextVisitor if ((asBuilder && indent == null) || (asBuilder && indent.length() == 0)) { indent = " "; } // default, two spaces indentString = indent; if (!asXml && !asHtml && !asXhtml && !asBuilder) { asXml = true; } + xmlnsNamespaceStack = asXml ? 
new Stack>() : null; } @Override @@ -432,19 +434,24 @@ public class SaveContextVisitor public boolean enter(Element element) { + pushXmlnsNamespaceStack(); + if (canonical) { c14nNodeList.add(element); if (element == element.getOwnerDocument().getDocumentElement()) { c14nNodeList.add(element.getOwnerDocument()); } } + String current = indentation.peek(); buffer.append(current); if (needIndent(element)) { indentation.push(current + indentString); } + String name = element.getTagName(); buffer.append('<').append(name); + Attr[] attrs = getAttrsAndNamespaces(element); for (Attr attr : attrs) { if (attr.getSpecified()) { @@ -453,11 +460,13 @@ public class SaveContextVisitor leave(attr); } } + if (element.hasChildNodes()) { buffer.append('>'); if (needBreakInOpening(element)) { buffer.append('\n'); } return true; } + // no child if (asHtml) { buffer.append('>'); @@ -472,12 +481,38 @@ public class SaveContextVisitor } else { buffer.append("/>"); } + if (needBreakInOpening(element)) { buffer.append('\n'); } return true; } + private Map + pushXmlnsNamespaceStack() { + if (!asXml || xmlnsNamespaceStack == null) { return null; } + Map newContext; + if (xmlnsNamespaceStack.isEmpty()) { + newContext = new HashMap(); + } else { + Map parentContext = xmlnsNamespaceStack.peek(); + newContext = new HashMap(parentContext); + } + return xmlnsNamespaceStack.push(newContext); + } + + private Map + popXmlnsNamespaceStack() { + if (!asXml || xmlnsNamespaceStack == null || xmlnsNamespaceStack.isEmpty()) { return null; } + return xmlnsNamespaceStack.pop(); + } + + private Map + peekXmlnsNamespaceStack() { + if (!asXml || xmlnsNamespaceStack == null || xmlnsNamespaceStack.isEmpty()) { return null; } + return xmlnsNamespaceStack.peek(); + } + private boolean needIndent(Element element) { @@ -511,11 +546,15 @@ public class SaveContextVisitor NamedNodeMap attrs = element.getAttributes(); if (!canonical) { if (attrs == null || attrs.getLength() == 0) { return new Attr[0]; } - Attr[] 
attrsAndNamespaces = new Attr[attrs.getLength()]; + Map xmlnsContext = peekXmlnsNamespaceStack(); + List filteredAttrsAndNamespaces = new ArrayList(); for (int i = 0; i < attrs.getLength(); i++) { - attrsAndNamespaces[i] = (Attr) attrs.item(i); + Attr attr = (Attr) attrs.item(i); + if (!findOrAddRedundantNamespaceAttr(xmlnsContext, attr)) { + filteredAttrsAndNamespaces.add(attr); + } } - return attrsAndNamespaces; + return filteredAttrsAndNamespaces.toArray(new Attr[0]); } else { List namespaces = new ArrayList(); List attributes = new ArrayList(); @@ -544,7 +583,45 @@ public class SaveContextVisitor c14nAttrStack.push(attributeArray); return allAttrs; } + } + + /** + * Detects whether a given attribute is a redundant xmlns namespace + * already present within xmlnsContext. + * + * As a side-effect, if the attribute is a non-redundant namespace, + * it is added to the xmlnsContext, so that it will be considered redundant + * for subsequent checks. + * + * @param xmlnsContext The namespace context, which should be the top object + * of xmlnsNamespaceStack. + * @param attr The attribute to check. + * @return True if the object is redundant, false otherwise. 
+ */ + private boolean + findOrAddRedundantNamespaceAttr(Map xmlnsContext, Attr attr) { + if (xmlnsContext == null || !attr.getSpecified()) { return false; } + + String xmlnsPrefix; + String attrName = attr.getNodeName(); + if (attrName.equals("xmlns")) { + xmlnsPrefix = ""; + } else if (attrName.startsWith("xmlns:")) { + xmlnsPrefix = attrName.substring(6); + } else { + // Not a namespace attribute + return false; + } + String xmlnsUri = attr.getNodeValue(); + if (xmlnsContext.containsKey(xmlnsPrefix) && xmlnsUri.equals(xmlnsContext.get(xmlnsPrefix))) { + // Redundant namespace detected + return true; + } else { + // Add non-redundant namespace to the top of xmlnsNamespaceStack + xmlnsContext.put(xmlnsPrefix, xmlnsUri); + return false; + } } private void @@ -653,6 +730,8 @@ public int compare(Attr attr0, Attr attr1) { public void leave(Element element) { + popXmlnsNamespaceStack(); + if (canonical) { c14nNamespaceStack.poll(); c14nAttrStack.poll(); diff --git a/ext/nokogiri/extconf.rb b/ext/nokogiri/extconf.rb index 70e2fe0cc9..2960367e6b 100644 --- a/ext/nokogiri/extconf.rb +++ b/ext/nokogiri/extconf.rb @@ -240,15 +240,7 @@ def zlib_source(version_string) end def gnome_source - # As of 2022-02-20, some mirrors have expired SSL certificates. I'm able to retrieve from my home, - # but whatever host is resolved on the github actions workers see an expired cert. - # - # See https://github.com/sparklemotion/nokogiri/runs/5266206403?check_suite_focus=true - if ENV["NOKOGIRI_USE_CANONICAL_GNOME_SOURCE"] - "https://download.gnome.org" - else - "https://muug.ca/mirror/gnome" # old reliable - end + "https://download.gnome.org" end LOCAL_PACKAGE_RESPONSE = Object.new @@ -469,7 +461,7 @@ def recipe.port_path yield recipe env = Hash.new do |hash, key| - hash[key] = (ENV[key]).to_s + hash[key] = ENV[key].to_s end recipe.configure_options.flatten! @@ -627,7 +619,7 @@ def do_clean def needs_darwin_linker_hack config_cross_build? && darwin? 
&& - Gem::Requirement.new("~> 3.2").satisfied_by?(Gem::Version.new(RbConfig::CONFIG["ruby_version"].split("+").first)) + RbConfig::MAKEFILE_CONFIG["EXTDLDFLAGS"].include?("-bundle_loader") end # @@ -764,10 +756,6 @@ def needs_darwin_linker_hack cross_build_p = config_cross_build? message "Cross build is #{cross_build_p ? "enabled" : "disabled"}.\n" - if needs_darwin_linker_hack - append_ldflags("-Wl,-flat_namespace") - end - require "yaml" dependencies = YAML.load_file(File.join(PACKAGE_ROOT_DIR, "dependencies.yml")) @@ -1133,6 +1121,7 @@ def compile have_func("xmlCtxtSetOptions") # introduced in libxml2 2.13.0 have_func("xmlCtxtGetOptions") # introduced in libxml2 2.14.0 have_func("xmlSwitchEncodingName") # introduced in libxml2 2.13.0 +have_func("xmlAddIDSafe") # introduced in libxml2 2.13.0 have_func("rb_category_warning") # introduced in Ruby 3.0 but had trouble resolving this symbol in truffleruby other_library_versions_string = OTHER_LIBRARY_VERSIONS.map { |k, v| [k, v].join(":") }.join(",") diff --git a/ext/nokogiri/gumbo.c b/ext/nokogiri/gumbo.c index fd938f3c0c..e3ed40a7ca 100644 --- a/ext/nokogiri/gumbo.c +++ b/ext/nokogiri/gumbo.c @@ -113,6 +113,71 @@ set_line(xmlNodePtr node, size_t line) } } +// This function is essentially xmlNewNsProp, but we skip the full list traversal to append by +// providing the last property in the linked list as a parameter. 
+static xmlAttrPtr +append_property(xmlNodePtr node, xmlNsPtr ns, const xmlChar *name, const xmlChar *value, xmlAttrPtr last_prop) +{ + xmlAttrPtr cur = (xmlAttrPtr) xmlMalloc(sizeof(xmlAttr)); + if (cur == NULL) { + return NULL; + } + memset(cur, 0, sizeof(xmlAttr)); + cur->type = XML_ATTRIBUTE_NODE; + cur->parent = node; + xmlDocPtr doc = node->doc; + cur->doc = doc; + cur->ns = ns; + + if ((doc != NULL) && (doc->dict != NULL)) { + cur->name = (xmlChar *) xmlDictLookup(doc->dict, name, -1); + } else { + cur->name = xmlStrdup(name); + } + if (cur->name == NULL) { + goto error; + } + + if (value != NULL) { + cur->children = xmlNewDocText(doc, value); + if (cur->children == NULL) { + goto error; + } + cur->last = NULL; + xmlNodePtr tmp = cur->children; + while (tmp != NULL) { + tmp->parent = (xmlNodePtr) cur; + if (tmp->next == NULL) { + cur->last = tmp; + } + tmp = tmp->next; + } + + if (doc != NULL) { + int res = xmlIsID(doc, node, cur); + if (res < 0) { + goto error; + } + if ((res == 1) && (xmlAddIDSafe(cur, value) < 0)) { + goto error; + } + } + } + + if (node->properties == NULL) { + node->properties = cur; + } else { + last_prop->next = cur; + cur->prev = last_prop; + } + + return cur; + +error: + xmlFreeProp(cur); + return (NULL); +} + // Construct an XML tree rooted at xml_output_node from the Gumbo tree rooted // at gumbo_node. static void @@ -200,6 +265,7 @@ build_tree( xmlAddChild(xml_node, xml_child); // Add the attributes. + xmlAttrPtr last_prop = NULL; const GumboVector *attrs = &gumbo_child->v.element.attributes; for (size_t i = 0; i < attrs->length; i++) { const GumboAttribute *attr = attrs->data[i]; @@ -220,7 +286,9 @@ build_tree( default: ns = NULL; } - xmlNewNsProp(xml_child, ns, (const xmlChar *)attr->name, (const xmlChar *)attr->value); + + // We micromanage the attribute list for performance reasons. 
+ last_prop = append_property(xml_child, ns, (const xmlChar *)attr->name, (const xmlChar *)attr->value, last_prop); } // Add children for this element. diff --git a/ext/nokogiri/libxml2_polyfill.c b/ext/nokogiri/libxml2_polyfill.c index 750b1b52a2..2b4296cf32 100644 --- a/ext/nokogiri/libxml2_polyfill.c +++ b/ext/nokogiri/libxml2_polyfill.c @@ -112,3 +112,15 @@ xmlSwitchEncodingName(xmlParserCtxtPtr ctxt, const char *encoding) return (xmlSwitchToEncoding(ctxt, handler)); } #endif + +#ifndef HAVE_XMLADDIDSAFE +int +xmlAddIDSafe(xmlAttrPtr attr, const xmlChar *value) +{ + xmlIDPtr id = xmlAddID(NULL, attr->doc, value, attr); + if (id) { + return 1; + } + return 0; +} +#endif diff --git a/ext/nokogiri/nokogiri.h b/ext/nokogiri/nokogiri.h index b75ebc47fa..99d38e1183 100644 --- a/ext/nokogiri/nokogiri.h +++ b/ext/nokogiri/nokogiri.h @@ -66,6 +66,9 @@ int xmlCtxtGetOptions(xmlParserCtxtPtr ctxt); #ifndef HAVE_XMLSWITCHENCODINGNAME int xmlSwitchEncodingName(xmlParserCtxtPtr ctxt, const char *encoding); #endif +#ifndef HAVE_XMLADDIDSAFE +int xmlAddIDSafe(xmlAttrPtr attr, const xmlChar *value); +#endif #define XMLNS_PREFIX "xmlns" #define XMLNS_PREFIX_LEN 6 /* including either colon or \0 */ diff --git a/ext/nokogiri/xml_sax_parser_context.c b/ext/nokogiri/xml_sax_parser_context.c index 75fe2e4f01..0d2b65b599 100644 --- a/ext/nokogiri/xml_sax_parser_context.c +++ b/ext/nokogiri/xml_sax_parser_context.c @@ -102,7 +102,10 @@ noko_xml_sax_parser_context_s_native_io(VALUE rb_class, VALUE rb_io, VALUE rb_en c_context->sax = NULL; } - return noko_xml_sax_parser_context_wrap(rb_class, c_context); + VALUE rb_context = noko_xml_sax_parser_context_wrap(rb_class, c_context); + rb_iv_set(rb_context, "@input", rb_io); + + return rb_context; } /* :nodoc: */ @@ -154,7 +157,10 @@ noko_xml_sax_parser_context_s_native_memory(VALUE rb_class, VALUE rb_input, VALU c_context->sax = NULL; } - return noko_xml_sax_parser_context_wrap(rb_class, c_context); + VALUE rb_context = 
noko_xml_sax_parser_context_wrap(rb_class, c_context); + rb_iv_set(rb_context, "@input", rb_input); + + return rb_context; } /* diff --git a/gumbo-parser/src/Makefile b/gumbo-parser/src/Makefile index 6bd4a18fbe..db58c3137f 100644 --- a/gumbo-parser/src/Makefile +++ b/gumbo-parser/src/Makefile @@ -13,9 +13,11 @@ gumbo_objs := \ char_ref.o \ error.o \ foreign_attrs.o \ + hashmap.o \ parser.o \ string_buffer.o \ string_piece.o \ + string_set.o \ svg_attrs.o \ svg_tags.o \ tag.o \ diff --git a/gumbo-parser/src/hashmap.c b/gumbo-parser/src/hashmap.c new file mode 100644 index 0000000000..d70b16277e --- /dev/null +++ b/gumbo-parser/src/hashmap.c @@ -0,0 +1,1154 @@ +// Copyright 2020 Joshua J Baker. All rights reserved. +// Use of this source code is governed by an MIT-style +// license that can be found in the LICENSE file. + +#include +#include +#include +#include +#include +#include "hashmap.h" + +#define GROW_AT 0.60 /* 60% */ +#define SHRINK_AT 0.10 /* 10% */ + +#ifndef HASHMAP_LOAD_FACTOR +#define HASHMAP_LOAD_FACTOR GROW_AT +#endif + +static void *(*__malloc)(size_t) = NULL; +static void *(*__realloc)(void *, size_t) = NULL; +static void (*__free)(void *) = NULL; + +// hashmap_set_allocator allows for configuring a custom allocator for +// all hashmap library operations. This function, if needed, should be called +// only once at startup and prior to calling hashmap_new(). +void hashmap_set_allocator(void *(*malloc)(size_t), void (*free)(void*)) { + __malloc = malloc; + __free = free; +} + +struct bucket { + uint64_t hash:48; + uint64_t dib:16; +}; + +// hashmap is an open addressed hash map using robinhood hashing.
+struct hashmap { + void *(*malloc)(size_t); + void *(*realloc)(void *, size_t); + void (*free)(void *); + size_t elsize; + size_t cap; + uint64_t seed0; + uint64_t seed1; + uint64_t (*hash)(const void *item, uint64_t seed0, uint64_t seed1); + int (*compare)(const void *a, const void *b, void *udata); + void (*elfree)(void *item); + void *udata; + size_t bucketsz; + size_t nbuckets; + size_t count; + size_t mask; + size_t growat; + size_t shrinkat; + uint8_t loadfactor; + uint8_t growpower; + bool oom; + void *buckets; + void *spare; + void *edata; +}; + +void hashmap_set_grow_by_power(struct hashmap *map, size_t power) { + map->growpower = power < 1 ? 1 : power > 16 ? 16 : power; +} + +static double clamp_load_factor(double factor, double default_factor) { + // Check for NaN and clamp between 50% and 95% + return factor != factor ? default_factor : + factor < 0.50 ? 0.50 : + factor > 0.95 ? 0.95 : + factor; +} + +void hashmap_set_load_factor(struct hashmap *map, double factor) { + factor = clamp_load_factor(factor, map->loadfactor / 100.0); + map->loadfactor = factor * 100; + map->growat = map->nbuckets * (map->loadfactor / 100.0); +} + +static struct bucket *bucket_at0(void *buckets, size_t bucketsz, size_t i) { + return (struct bucket*)(((char*)buckets)+(bucketsz*i)); +} + +static struct bucket *bucket_at(struct hashmap *map, size_t index) { + return bucket_at0(map->buckets, map->bucketsz, index); +} + +static void *bucket_item(struct bucket *entry) { + return ((char*)entry)+sizeof(struct bucket); +} + +static uint64_t clip_hash(uint64_t hash) { + return hash & 0xFFFFFFFFFFFF; +} + +static uint64_t get_hash(struct hashmap *map, const void *key) { + return clip_hash(map->hash(key, map->seed0, map->seed1)); +} + + +// hashmap_new_with_allocator returns a new hash map.
+// See hashmap_new for more information. +struct hashmap *hashmap_new_with_allocator(void *(*_malloc)(size_t), + void *(*_realloc)(void*, size_t), void (*_free)(void*), + size_t elsize, size_t cap, uint64_t seed0, uint64_t seed1, + uint64_t (*hash)(const void *item, uint64_t seed0, uint64_t seed1), + int (*compare)(const void *a, const void *b, void *udata), + void (*elfree)(void *item), + void *udata) +{ + _malloc = _malloc ? _malloc : __malloc ? __malloc : malloc; + _realloc = _realloc ? _realloc : __realloc ? __realloc : realloc; + _free = _free ? _free : __free ? __free : free; + size_t ncap = 16; + if (cap < ncap) { + cap = ncap; + } else { + while (ncap < cap) { + ncap *= 2; + } + cap = ncap; + } + size_t bucketsz = sizeof(struct bucket) + elsize; + while (bucketsz & (sizeof(uintptr_t)-1)) { + bucketsz++; + } + // hashmap + spare + edata + size_t size = sizeof(struct hashmap)+bucketsz*2; + struct hashmap *map = _malloc(size); + if (!map) { + return NULL; + } + memset(map, 0, sizeof(struct hashmap)); + map->elsize = elsize; + map->bucketsz = bucketsz; + map->seed0 = seed0; + map->seed1 = seed1; + map->hash = hash; + map->compare = compare; + map->elfree = elfree; + map->udata = udata; + map->spare = ((char*)map)+sizeof(struct hashmap); + map->edata = (char*)map->spare+bucketsz; + map->cap = cap; + map->nbuckets = cap; + map->mask = map->nbuckets-1; + map->buckets = _malloc(map->bucketsz*map->nbuckets); + if (!map->buckets) { + _free(map); + return NULL; + } + memset(map->buckets, 0, map->bucketsz*map->nbuckets); + map->growpower = 1; + map->loadfactor = clamp_load_factor(HASHMAP_LOAD_FACTOR, GROW_AT) * 100; + map->growat = map->nbuckets * (map->loadfactor / 100.0); + map->shrinkat = map->nbuckets * SHRINK_AT; + map->malloc = _malloc; + map->realloc = _realloc; + map->free = _free; + return map; +} + +// hashmap_new returns a new hash map. +// Param `elsize` is the size of each element in the hash map.
Every element that +// is inserted, deleted, or retrieved will be this size. +// Param `cap` is the default lower capacity of the hashmap. Setting this to +// zero will default to 16. +// Params `seed0` and `seed1` are optional seed values that are passed to the +// following `hash` function. These can be any value you wish but it's often +// best to use randomly generated values. +// Param `hash` is a function that generates a hash value for an item. It's +// important that you provide a good hash function, otherwise it will perform +// poorly or be vulnerable to Denial-of-service attacks. This implementation +// comes with two helper functions `hashmap_sip()` and `hashmap_murmur()`. +// Param `compare` is a function that compares items in the hash map. See the +// qsort stdlib function for an example of how this function works. +// The hashmap must be freed with hashmap_free(). +// Param `elfree` is a function that frees a specific item. This should be NULL +// unless you're storing some kind of reference data in the hash. +struct hashmap *hashmap_new(size_t elsize, size_t cap, uint64_t seed0, + uint64_t seed1, + uint64_t (*hash)(const void *item, uint64_t seed0, uint64_t seed1), + int (*compare)(const void *a, const void *b, void *udata), + void (*elfree)(void *item), + void *udata) +{ + return hashmap_new_with_allocator(NULL, NULL, NULL, elsize, cap, seed0, + seed1, hash, compare, elfree, udata); +} + +static void free_elements(struct hashmap *map) { + if (map->elfree) { + for (size_t i = 0; i < map->nbuckets; i++) { + struct bucket *bucket = bucket_at(map, i); + if (bucket->dib) map->elfree(bucket_item(bucket)); + } + } +} + +// hashmap_clear quickly clears the map. +// Every item is called with the element-freeing function given in hashmap_new, +// if present, to free any data referenced in the elements of the hashmap. +// When the update_cap is provided, the map's capacity will be updated to match +// the current number of allocated buckets.
This is an optimization to ensure +// that this operation does not perform any allocations. +void hashmap_clear(struct hashmap *map, bool update_cap) { + map->count = 0; + free_elements(map); + if (update_cap) { + map->cap = map->nbuckets; + } else if (map->nbuckets != map->cap) { + void *new_buckets = map->malloc(map->bucketsz*map->cap); + if (new_buckets) { + map->free(map->buckets); + map->buckets = new_buckets; + } + map->nbuckets = map->cap; + } + memset(map->buckets, 0, map->bucketsz*map->nbuckets); + map->mask = map->nbuckets-1; + map->growat = map->nbuckets * (map->loadfactor / 100.0) ; + map->shrinkat = map->nbuckets * SHRINK_AT; +} + +static bool resize0(struct hashmap *map, size_t new_cap) { + struct hashmap *map2 = hashmap_new_with_allocator(map->malloc, map->realloc, + map->free, map->elsize, new_cap, map->seed0, map->seed1, map->hash, + map->compare, map->elfree, map->udata); + if (!map2) return false; + for (size_t i = 0; i < map->nbuckets; i++) { + struct bucket *entry = bucket_at(map, i); + if (!entry->dib) { + continue; + } + entry->dib = 1; + size_t j = entry->hash & map2->mask; + while(1) { + struct bucket *bucket = bucket_at(map2, j); + if (bucket->dib == 0) { + memcpy(bucket, entry, map->bucketsz); + break; + } + if (bucket->dib < entry->dib) { + memcpy(map2->spare, bucket, map->bucketsz); + memcpy(bucket, entry, map->bucketsz); + memcpy(entry, map2->spare, map->bucketsz); + } + j = (j + 1) & map2->mask; + entry->dib += 1; + } + } + map->free(map->buckets); + map->buckets = map2->buckets; + map->nbuckets = map2->nbuckets; + map->mask = map2->mask; + map->growat = map2->growat; + map->shrinkat = map2->shrinkat; + map->free(map2); + return true; +} + +static bool resize(struct hashmap *map, size_t new_cap) { + return resize0(map, new_cap); +} + +// hashmap_set_with_hash works like hashmap_set but you provide your +// own hash. 
The 'hash' callback provided to the hashmap_new function +// will not be called +const void *hashmap_set_with_hash(struct hashmap *map, const void *item, + uint64_t hash) +{ + hash = clip_hash(hash); + map->oom = false; + if (map->count >= map->growat) { + if (!resize(map, map->nbuckets*(1<growpower))) { + map->oom = true; + return NULL; + } + } + + struct bucket *entry = map->edata; + entry->hash = hash; + entry->dib = 1; + void *eitem = bucket_item(entry); + memcpy(eitem, item, map->elsize); + + void *bitem; + size_t i = entry->hash & map->mask; + while(1) { + struct bucket *bucket = bucket_at(map, i); + if (bucket->dib == 0) { + memcpy(bucket, entry, map->bucketsz); + map->count++; + return NULL; + } + bitem = bucket_item(bucket); + if (entry->hash == bucket->hash && (!map->compare || + map->compare(eitem, bitem, map->udata) == 0)) + { + memcpy(map->spare, bitem, map->elsize); + memcpy(bitem, eitem, map->elsize); + return map->spare; + } + if (bucket->dib < entry->dib) { + memcpy(map->spare, bucket, map->bucketsz); + memcpy(bucket, entry, map->bucketsz); + memcpy(entry, map->spare, map->bucketsz); + eitem = bucket_item(entry); + } + i = (i + 1) & map->mask; + entry->dib += 1; + } +} + +// hashmap_set inserts or replaces an item in the hash map. If an item is +// replaced then it is returned otherwise NULL is returned. This operation +// may allocate memory. If the system is unable to allocate additional +// memory then NULL is returned and hashmap_oom() returns true. +const void *hashmap_set(struct hashmap *map, const void *item) { + return hashmap_set_with_hash(map, item, get_hash(map, item)); +} + +// hashmap_get_with_hash works like hashmap_get but you provide your +// own hash. 
The 'hash' callback provided to the hashmap_new function +// will not be called +const void *hashmap_get_with_hash(struct hashmap *map, const void *key, + uint64_t hash) +{ + hash = clip_hash(hash); + size_t i = hash & map->mask; + while(1) { + struct bucket *bucket = bucket_at(map, i); + if (!bucket->dib) return NULL; + if (bucket->hash == hash) { + void *bitem = bucket_item(bucket); + if (!map->compare || map->compare(key, bitem, map->udata) == 0) { + return bitem; + } + } + i = (i + 1) & map->mask; + } +} + +// hashmap_get returns the item based on the provided key. If the item is not +// found then NULL is returned. +const void *hashmap_get(struct hashmap *map, const void *key) { + return hashmap_get_with_hash(map, key, get_hash(map, key)); +} + +// hashmap_probe returns the item in the bucket at position or NULL if an item +// is not set for that bucket. The position is 'moduloed' by the number of +// buckets in the hashmap. +const void *hashmap_probe(struct hashmap *map, uint64_t position) { + size_t i = position & map->mask; + struct bucket *bucket = bucket_at(map, i); + if (!bucket->dib) { + return NULL; + } + return bucket_item(bucket); +} + +// hashmap_delete_with_hash works like hashmap_delete but you provide your +// own hash. 
The 'hash' callback provided to the hashmap_new function +// will not be called +const void *hashmap_delete_with_hash(struct hashmap *map, const void *key, + uint64_t hash) +{ + hash = clip_hash(hash); + map->oom = false; + size_t i = hash & map->mask; + while(1) { + struct bucket *bucket = bucket_at(map, i); + if (!bucket->dib) { + return NULL; + } + void *bitem = bucket_item(bucket); + if (bucket->hash == hash && (!map->compare || + map->compare(key, bitem, map->udata) == 0)) + { + memcpy(map->spare, bitem, map->elsize); + bucket->dib = 0; + while(1) { + struct bucket *prev = bucket; + i = (i + 1) & map->mask; + bucket = bucket_at(map, i); + if (bucket->dib <= 1) { + prev->dib = 0; + break; + } + memcpy(prev, bucket, map->bucketsz); + prev->dib--; + } + map->count--; + if (map->nbuckets > map->cap && map->count <= map->shrinkat) { + // Ignore the return value. It's ok for the resize operation to + // fail to allocate enough memory because a shrink operation + // does not change the integrity of the data. + resize(map, map->nbuckets/2); + } + return map->spare; + } + i = (i + 1) & map->mask; + } +} + +// hashmap_delete removes an item from the hash map and returns it. If the +// item is not found then NULL is returned. +const void *hashmap_delete(struct hashmap *map, const void *key) { + return hashmap_delete_with_hash(map, key, get_hash(map, key)); +} + +// hashmap_count returns the number of items in the hash map. +size_t hashmap_count(struct hashmap *map) { + return map->count; +} + +// hashmap_free frees the hash map +// Every item is called with the element-freeing function given in hashmap_new, +// if present, to free any data referenced in the elements of the hashmap. +void hashmap_free(struct hashmap *map) { + if (!map) return; + free_elements(map); + map->free(map->buckets); + map->free(map); +} + +// hashmap_oom returns true if the last hashmap_set() call failed due to the +// system being out of memory. 
+bool hashmap_oom(struct hashmap *map) { + return map->oom; +} + +// hashmap_scan iterates over all items in the hash map +// Param `iter` can return false to stop iteration early. +// Returns false if the iteration has been stopped early. +bool hashmap_scan(struct hashmap *map, + bool (*iter)(const void *item, void *udata), void *udata) +{ + for (size_t i = 0; i < map->nbuckets; i++) { + struct bucket *bucket = bucket_at(map, i); + if (bucket->dib && !iter(bucket_item(bucket), udata)) { + return false; + } + } + return true; +} + +// hashmap_iter iterates one key at a time yielding a reference to an +// entry at each iteration. Useful to write simple loops and avoid writing +// dedicated callbacks and udata structures, as in hashmap_scan. +// +// map is a hash map handle. i is a pointer to a size_t cursor that +// should be initialized to 0 at the beginning of the loop. item is a void +// pointer pointer that is populated with the retrieved item. Note that this +// is NOT a copy of the item stored in the hash map and can be directly +// modified. +// +// Note that if hashmap_delete() is called on the hashmap being iterated, +// the buckets are rearranged and the iterator must be reset to 0, otherwise +// unexpected results may be returned after deletion. +// +// This function has not been tested for thread safety. +// +// The function returns true if an item was retrieved; false if the end of the +// iteration has been reached. +bool hashmap_iter(struct hashmap *map, size_t *i, void **item) { + struct bucket *bucket; + do { + if (*i >= map->nbuckets) return false; + bucket = bucket_at(map, *i); + (*i)++; + } while (!bucket->dib); + *item = bucket_item(bucket); + return true; +} + + +//----------------------------------------------------------------------------- +// SipHash reference C implementation +// +// Copyright (c) 2012-2016 Jean-Philippe Aumasson +// +// Copyright (c) 2012-2014 Daniel J. 
// Bernstein
//
// To the extent possible under law, the author(s) have dedicated all copyright
// and related and neighboring rights to this software to the public domain
// worldwide. This software is distributed without any warranty.
//
// You should have received a copy of the CC0 Public Domain Dedication along
// with this software. If not, see
// <http://creativecommons.org/publicdomain/zero/1.0/>.
//
// default: SipHash-2-4
//-----------------------------------------------------------------------------
// SIP64 hashes `inlen` bytes at `in`, keyed by seed0/seed1, and returns a
// 64-bit value. Two SIPROUNDs are applied per 8-byte word and four during
// finalization.
static uint64_t SIP64(const uint8_t *in, const size_t inlen, uint64_t seed0,
    uint64_t seed1)
{
// U8TO64_LE assembles 8 bytes into a uint64_t, least significant byte first.
// The expansion is wrapped in braces, so it is only usable as an initializer.
#define U8TO64_LE(p) \
    {  (((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) | \
        ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) | \
        ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) | \
        ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56)) }
// U64TO8_LE / U32TO8_LE store a value byte by byte, least significant first.
#define U64TO8_LE(p, v) \
    { U32TO8_LE((p), (uint32_t)((v))); \
      U32TO8_LE((p) + 4, (uint32_t)((v) >> 32)); }
#define U32TO8_LE(p, v) \
    { (p)[0] = (uint8_t)((v)); \
      (p)[1] = (uint8_t)((v) >> 8); \
      (p)[2] = (uint8_t)((v) >> 16); \
      (p)[3] = (uint8_t)((v) >> 24); }
// ROTL rotates a 64-bit value left by b bits.
#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))
// SIPROUND is one mixing round over the four state words v0..v3.
#define SIPROUND \
    { v0 += v1; v1 = ROTL(v1, 13); \
      v1 ^= v0; v0 = ROTL(v0, 32); \
      v2 += v3; v3 = ROTL(v3, 16); \
      v3 ^= v2; \
      v0 += v3; v3 = ROTL(v3, 21); \
      v3 ^= v0; \
      v2 += v1; v1 = ROTL(v1, 17); \
      v1 ^= v2; v2 = ROTL(v2, 32); }
    uint64_t k0 = U8TO64_LE((uint8_t*)&seed0);
    uint64_t k1 = U8TO64_LE((uint8_t*)&seed1);
    uint64_t v3 = UINT64_C(0x7465646279746573) ^ k1;
    uint64_t v2 = UINT64_C(0x6c7967656e657261) ^ k0;
    uint64_t v1 = UINT64_C(0x646f72616e646f6d) ^ k1;
    uint64_t v0 = UINT64_C(0x736f6d6570736575) ^ k0;
    // `end` marks the last whole 8-byte word; the remaining 0..7 bytes are
    // folded into `b` below.
    const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t));
    for (; in != end; in += 8) {
        uint64_t m = U8TO64_LE(in);
        v3 ^= m;
        SIPROUND; SIPROUND;
        v0 ^= m;
    }
    const int left = inlen & 7;
    // The top byte of the final word carries the input length.
    uint64_t b = ((uint64_t)inlen) << 56;
    switch (left) {
    case 7: b |= ((uint64_t)in[6]) << 48; /* fall through */
    case 6: b |= ((uint64_t)in[5]) << 40; /* fall through */
    case 5: b |= ((uint64_t)in[4]) << 32; /* fall through */
    case 4: b |= ((uint64_t)in[3]) << 24; /* fall through */
    case 3: b |= ((uint64_t)in[2]) << 16; /* fall through */
    case 2: b |= ((uint64_t)in[1]) << 8; /* fall through */
    case 1: b |= ((uint64_t)in[0]); break;
    case 0: break;
    }
    v3 ^= b;
    SIPROUND; SIPROUND;
    v0 ^= b;
    v2 ^= 0xff;
    SIPROUND; SIPROUND; SIPROUND; SIPROUND;
    b = v0 ^ v1 ^ v2 ^ v3;
    uint64_t out = 0;
    U64TO8_LE((uint8_t*)&out, b);
    return out;
}

//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
//
// Murmur3_86_128
//-----------------------------------------------------------------------------
// MM86128 hashes `len` bytes at `key` with a 32-bit seed. Four 32-bit lanes
// (h1..h4) are mixed; only h1 and h2 are combined into the 64-bit result.
static uint64_t MM86128(const void *key, const int len, uint32_t seed) {
// ROTL32 rotates a 32-bit value left by r bits. Arguments are not
// parenthesised, so pass simple expressions only.
#define	ROTL32(x, r) ((x << r) | (x >> (32 - r)))
// FMIX32 is the final per-lane mix; it expands to several statements.
#define FMIX32(h) h^=h>>16; h*=0x85ebca6b; h^=h>>13; h*=0xc2b2ae35; h^=h>>16;
    const uint8_t * data = (const uint8_t*)key;
    const int nblocks = len / 16;
    uint32_t h1 = seed;
    uint32_t h2 = seed;
    uint32_t h3 = seed;
    uint32_t h4 = seed;
    uint32_t c1 = 0x239b961b;
    uint32_t c2 = 0xab0e9789;
    uint32_t c3 = 0x38b34ae5;
    uint32_t c4 = 0xa1e38b93;
    // `blocks` points just past the last whole 16-byte block; the loop walks
    // it with negative indices from -nblocks up to -1.
    // NOTE(review): this reads uint32_t through a cast of the byte pointer,
    // which presumably relies on the input tolerating such access - confirm.
    const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
    for (int i = -nblocks; i; i++) {
        uint32_t k1 = blocks[i*4+0];
        uint32_t k2 = blocks[i*4+1];
        uint32_t k3 = blocks[i*4+2];
        uint32_t k4 = blocks[i*4+3];
        k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
        h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
        k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
        h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
        k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
        h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
        k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
        h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
    }
    // Fold the trailing 0..15 bytes into the lanes.
    const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
    uint32_t k1 = 0;
    uint32_t k2 = 0;
    uint32_t k3 = 0;
    uint32_t k4 = 0;
    switch(len & 15) {
    case 15: k4 ^= tail[14] << 16; /* fall through */
    case 14: k4 ^= tail[13] << 8; /* fall through */
    case 13: k4 ^= tail[12] << 0;
             k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
             /* fall through */
    case 12: k3 ^= tail[11] << 24; /* fall through */
    case 11: k3 ^= tail[10] << 16; /* fall through */
    case 10: k3 ^= tail[ 9] << 8; /* fall through */
    case  9: k3 ^= tail[ 8] << 0;
             k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
             /* fall through */
    case  8: k2 ^= tail[ 7] << 24; /* fall through */
    case  7: k2 ^= tail[ 6] << 16; /* fall through */
    case  6: k2 ^= tail[ 5] << 8; /* fall through */
    case  5: k2 ^= tail[ 4] << 0;
             k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
             /* fall through */
    case  4: k1 ^= tail[ 3] << 24; /* fall through */
    case  3: k1 ^= tail[ 2] << 16; /* fall through */
    case  2: k1 ^= tail[ 1] << 8; /* fall through */
    case  1: k1 ^= tail[ 0] << 0;
             k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
             /* fall through */
    };
    h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
    h1 += h2; h1 += h3; h1 += h4;
    h2 += h1; h3 += h1; h4 += h1;
    FMIX32(h1); FMIX32(h2); FMIX32(h3); FMIX32(h4);
    h1 += h2; h1 += h3; h1 += h4;
    h2 += h1; h3 += h1; h4 += h1;
    // Only the first two lanes make up the returned 64 bits.
    return (((uint64_t)h2)<<32)|h1;
}

//-----------------------------------------------------------------------------
// xxHash Library
// Copyright (c) 2012-2021 Yann Collet
// All rights reserved.
//
// BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
//
// xxHash3
//-----------------------------------------------------------------------------
#define XXH_PRIME_1 11400714785074694791ULL
#define XXH_PRIME_2 14029467366897019727ULL
#define XXH_PRIME_3 1609587929392839161ULL
#define XXH_PRIME_4 9650029242287828579ULL
#define XXH_PRIME_5 2870177450012600261ULL

// XXH_read64 loads 8 bytes via memcpy, so `memptr` need not be aligned.
// The value is read in the host's byte order.
static uint64_t XXH_read64(const void* memptr) {
    uint64_t val;
    memcpy(&val, memptr, sizeof(val));
    return val;
}

// XXH_read32 loads 4 bytes via memcpy, so `memptr` need not be aligned.
static uint32_t XXH_read32(const void* memptr) {
    uint32_t val;
    memcpy(&val, memptr, sizeof(val));
    return val;
}

// XXH_rotl64 rotates x left by r bits; callers here pass r in 1..31.
static uint64_t XXH_rotl64(uint64_t x, int r) {
    return (x << r) | (x >> (64 - r));
}

// xxh3 hashes `len` bytes at `data` with a 64-bit seed.
// Inputs of 32 bytes or more run four accumulators over 32-byte stripes;
// the remainder is consumed in 8-byte, 4-byte and 1-byte steps before the
// final avalanche.
// NOTE(review): the primes and stripe structure look like the XXH64
// algorithm rather than XXH3, despite the name - confirm against upstream.
static uint64_t xxh3(const void* data, size_t len, uint64_t seed) {
    const uint8_t* p = (const uint8_t*)data;
    const uint8_t* const end = p + len;
    uint64_t h64;

    if (len >= 32) {
        const uint8_t* const limit = end - 32;
        uint64_t v1 = seed + XXH_PRIME_1 + XXH_PRIME_2;
        uint64_t v2 = seed + XXH_PRIME_2;
        uint64_t v3 = seed + 0;
        uint64_t v4 = seed - XXH_PRIME_1;

        do {
            v1 += XXH_read64(p) * XXH_PRIME_2;
            v1 = XXH_rotl64(v1, 31);
            v1 *= XXH_PRIME_1;

            v2 += XXH_read64(p + 8) * XXH_PRIME_2;
            v2 = XXH_rotl64(v2, 31);
            v2 *= XXH_PRIME_1;

            v3 += XXH_read64(p + 16) * XXH_PRIME_2;
            v3 = XXH_rotl64(v3, 31);
            v3 *= XXH_PRIME_1;

            v4 += XXH_read64(p + 24) * XXH_PRIME_2;
            v4 = XXH_rotl64(v4, 31);
            v4 *= XXH_PRIME_1;

            p += 32;
        } while (p <= limit);

        // Merge the four accumulators into a single 64-bit state.
        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) +
            XXH_rotl64(v4, 18);

        v1 *= XXH_PRIME_2;
        v1 = XXH_rotl64(v1, 31);
        v1 *= XXH_PRIME_1;
        h64 ^= v1;
        h64 = h64 * XXH_PRIME_1 + XXH_PRIME_4;

        v2 *= XXH_PRIME_2;
        v2 = XXH_rotl64(v2, 31);
        v2 *= XXH_PRIME_1;
        h64 ^= v2;
        h64 = h64 * XXH_PRIME_1 + XXH_PRIME_4;

        v3 *= XXH_PRIME_2;
        v3 = XXH_rotl64(v3, 31);
        v3 *= XXH_PRIME_1;
        h64 ^= v3;
        h64 = h64 * XXH_PRIME_1 + XXH_PRIME_4;

        v4 *= XXH_PRIME_2;
        v4 = XXH_rotl64(v4, 31);
        v4 *= XXH_PRIME_1;
        h64 ^= v4;
        h64 = h64 * XXH_PRIME_1 + XXH_PRIME_4;
    }
    else {
        h64 = seed + XXH_PRIME_5;
    }

    h64 += (uint64_t)len;

    // Remaining whole 8-byte words.
    while (p + 8 <= end) {
        uint64_t k1 = XXH_read64(p);
        k1 *= XXH_PRIME_2;
        k1 = XXH_rotl64(k1, 31);
        k1 *= XXH_PRIME_1;
        h64 ^= k1;
        h64 = XXH_rotl64(h64, 27) * XXH_PRIME_1 + XXH_PRIME_4;
        p += 8;
    }

    // At most one remaining 4-byte word.
    if (p + 4 <= end) {
        h64 ^= (uint64_t)(XXH_read32(p)) * XXH_PRIME_1;
        h64 = XXH_rotl64(h64, 23) * XXH_PRIME_2 + XXH_PRIME_3;
        p += 4;
    }

    // Remaining single bytes.
    while (p < end) {
        h64 ^= (*p) * XXH_PRIME_5;
        h64 = XXH_rotl64(h64, 11) * XXH_PRIME_1;
        p++;
    }

    // Final avalanche.
    h64 ^= h64 >> 33;
    h64 *= XXH_PRIME_2;
    h64 ^= h64 >> 29;
    h64 *= XXH_PRIME_3;
    h64 ^= h64 >> 32;

    return h64;
}

// hashmap_sip returns a hash value for `data` using SipHash-2-4.
uint64_t hashmap_sip(const void *data, size_t len, uint64_t seed0,
    uint64_t seed1)
{
    return SIP64((uint8_t*)data, len, seed0, seed1);
}

// hashmap_murmur returns a hash value for `data` using Murmur3_86_128.
+uint64_t hashmap_murmur(const void *data, size_t len, uint64_t seed0, + uint64_t seed1) +{ + (void)seed1; + return MM86128(data, len, seed0); +} + +uint64_t hashmap_xxhash3(const void *data, size_t len, uint64_t seed0, + uint64_t seed1) +{ + (void)seed1; + return xxh3(data, len ,seed0); +} + +//============================================================================== +// TESTS AND BENCHMARKS +// $ cc -DHASHMAP_TEST hashmap.c && ./a.out # run tests +// $ cc -DHASHMAP_TEST -O3 hashmap.c && BENCH=1 ./a.out # run benchmarks +//============================================================================== +#ifdef HASHMAP_TEST + +static size_t deepcount(struct hashmap *map) { + size_t count = 0; + for (size_t i = 0; i < map->nbuckets; i++) { + if (bucket_at(map, i)->dib) { + count++; + } + } + return count; +} + +#ifdef __GNUC__ +#pragma GCC diagnostic ignored "-Wpedantic" +#endif +#ifdef __clang__ +#pragma GCC diagnostic ignored "-Wunknown-warning-option" +#pragma GCC diagnostic ignored "-Wcompound-token-split-by-macro" +#pragma GCC diagnostic ignored "-Wgnu-statement-expression-from-macro-expansion" +#endif +#ifdef __GNUC__ +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +#include +#include +#include +#include +#include +#include "hashmap.h" + +static bool rand_alloc_fail = false; +static int rand_alloc_fail_odds = 3; // 1 in 3 chance malloc will fail. 
+static uintptr_t total_allocs = 0; +static uintptr_t total_mem = 0; + +static void *xmalloc(size_t size) { + if (rand_alloc_fail && rand()%rand_alloc_fail_odds == 0) { + return NULL; + } + void *mem = malloc(sizeof(uintptr_t)+size); + assert(mem); + *(uintptr_t*)mem = size; + total_allocs++; + total_mem += size; + return (char*)mem+sizeof(uintptr_t); +} + +static void xfree(void *ptr) { + if (ptr) { + total_mem -= *(uintptr_t*)((char*)ptr-sizeof(uintptr_t)); + free((char*)ptr-sizeof(uintptr_t)); + total_allocs--; + } +} + +static void shuffle(void *array, size_t numels, size_t elsize) { + char tmp[elsize]; + char *arr = array; + for (size_t i = 0; i < numels - 1; i++) { + int j = i + rand() / (RAND_MAX / (numels - i) + 1); + memcpy(tmp, arr + j * elsize, elsize); + memcpy(arr + j * elsize, arr + i * elsize, elsize); + memcpy(arr + i * elsize, tmp, elsize); + } +} + +static bool iter_ints(const void *item, void *udata) { + int *vals = *(int**)udata; + vals[*(int*)item] = 1; + return true; +} + +static int compare_ints_udata(const void *a, const void *b, void *udata) { + return *(int*)a - *(int*)b; +} + +static int compare_strs(const void *a, const void *b, void *udata) { + return strcmp(*(char**)a, *(char**)b); +} + +static uint64_t hash_int(const void *item, uint64_t seed0, uint64_t seed1) { + return hashmap_xxhash3(item, sizeof(int), seed0, seed1); + // return hashmap_sip(item, sizeof(int), seed0, seed1); + // return hashmap_murmur(item, sizeof(int), seed0, seed1); +} + +static uint64_t hash_str(const void *item, uint64_t seed0, uint64_t seed1) { + return hashmap_xxhash3(*(char**)item, strlen(*(char**)item), seed0, seed1); + // return hashmap_sip(*(char**)item, strlen(*(char**)item), seed0, seed1); + // return hashmap_murmur(*(char**)item, strlen(*(char**)item), seed0, seed1); +} + +static void free_str(void *item) { + xfree(*(char**)item); +} + +static void all(void) { + int seed = getenv("SEED")?atoi(getenv("SEED")):time(NULL); + int N = 
getenv("N")?atoi(getenv("N")):2000; + printf("seed=%d, count=%d, item_size=%zu\n", seed, N, sizeof(int)); + srand(seed); + + rand_alloc_fail = true; + + // test sip and murmur hashes + assert(hashmap_sip("hello", 5, 1, 2) == 2957200328589801622); + assert(hashmap_murmur("hello", 5, 1, 2) == 1682575153221130884); + assert(hashmap_xxhash3("hello", 5, 1, 2) == 2584346877953614258); + + int *vals; + while (!(vals = xmalloc(N * sizeof(int)))) {} + for (int i = 0; i < N; i++) { + vals[i] = i; + } + + struct hashmap *map; + + while (!(map = hashmap_new(sizeof(int), 0, seed, seed, + hash_int, compare_ints_udata, NULL, NULL))) {} + shuffle(vals, N, sizeof(int)); + for (int i = 0; i < N; i++) { + // // printf("== %d ==\n", vals[i]); + assert(map->count == (size_t)i); + assert(map->count == hashmap_count(map)); + assert(map->count == deepcount(map)); + const int *v; + assert(!hashmap_get(map, &vals[i])); + assert(!hashmap_delete(map, &vals[i])); + while (true) { + assert(!hashmap_set(map, &vals[i])); + if (!hashmap_oom(map)) { + break; + } + } + + for (int j = 0; j < i; j++) { + v = hashmap_get(map, &vals[j]); + assert(v && *v == vals[j]); + } + while (true) { + v = hashmap_set(map, &vals[i]); + if (!v) { + assert(hashmap_oom(map)); + continue; + } else { + assert(!hashmap_oom(map)); + assert(v && *v == vals[i]); + break; + } + } + v = hashmap_get(map, &vals[i]); + assert(v && *v == vals[i]); + v = hashmap_delete(map, &vals[i]); + assert(v && *v == vals[i]); + assert(!hashmap_get(map, &vals[i])); + assert(!hashmap_delete(map, &vals[i])); + assert(!hashmap_set(map, &vals[i])); + assert(map->count == (size_t)(i+1)); + assert(map->count == hashmap_count(map)); + assert(map->count == deepcount(map)); + } + + int *vals2; + while (!(vals2 = xmalloc(N * sizeof(int)))) {} + memset(vals2, 0, N * sizeof(int)); + assert(hashmap_scan(map, iter_ints, &vals2)); + + // Test hashmap_iter. This does the same as hashmap_scan above. 
+ size_t iter = 0; + void *iter_val; + while (hashmap_iter (map, &iter, &iter_val)) { + assert (iter_ints(iter_val, &vals2)); + } + for (int i = 0; i < N; i++) { + assert(vals2[i] == 1); + } + xfree(vals2); + + shuffle(vals, N, sizeof(int)); + for (int i = 0; i < N; i++) { + const int *v; + v = hashmap_delete(map, &vals[i]); + assert(v && *v == vals[i]); + assert(!hashmap_get(map, &vals[i])); + assert(map->count == (size_t)(N-i-1)); + assert(map->count == hashmap_count(map)); + assert(map->count == deepcount(map)); + for (int j = N-1; j > i; j--) { + v = hashmap_get(map, &vals[j]); + assert(v && *v == vals[j]); + } + } + + for (int i = 0; i < N; i++) { + while (true) { + assert(!hashmap_set(map, &vals[i])); + if (!hashmap_oom(map)) { + break; + } + } + } + + assert(map->count != 0); + size_t prev_cap = map->cap; + hashmap_clear(map, true); + assert(prev_cap < map->cap); + assert(map->count == 0); + + + for (int i = 0; i < N; i++) { + while (true) { + assert(!hashmap_set(map, &vals[i])); + if (!hashmap_oom(map)) { + break; + } + } + } + + prev_cap = map->cap; + hashmap_clear(map, false); + assert(prev_cap == map->cap); + + hashmap_free(map); + + xfree(vals); + + + while (!(map = hashmap_new(sizeof(char*), 0, seed, seed, + hash_str, compare_strs, free_str, NULL))); + + for (int i = 0; i < N; i++) { + char *str; + while (!(str = xmalloc(16))); + snprintf(str, 16, "s%i", i); + while(!hashmap_set(map, &str)); + } + + hashmap_clear(map, false); + assert(hashmap_count(map) == 0); + + for (int i = 0; i < N; i++) { + char *str; + while (!(str = xmalloc(16))); + snprintf(str, 16, "s%i", i); + while(!hashmap_set(map, &str)); + } + + hashmap_free(map); + + if (total_allocs != 0) { + fprintf(stderr, "total_allocs: expected 0, got %lu\n", total_allocs); + exit(1); + } +} + +#define bench(name, N, code) {{ \ + if (strlen(name) > 0) { \ + printf("%-14s ", name); \ + } \ + size_t tmem = total_mem; \ + size_t tallocs = total_allocs; \ + uint64_t bytes = 0; \ + clock_t begin = 
clock(); \ + for (int i = 0; i < N; i++) { \ + (code); \ + } \ + clock_t end = clock(); \ + double elapsed_secs = (double)(end - begin) / CLOCKS_PER_SEC; \ + double bytes_sec = (double)bytes/elapsed_secs; \ + printf("%d ops in %.3f secs, %.0f ns/op, %.0f op/sec", \ + N, elapsed_secs, \ + elapsed_secs/(double)N*1e9, \ + (double)N/elapsed_secs \ + ); \ + if (bytes > 0) { \ + printf(", %.1f GB/sec", bytes_sec/1024/1024/1024); \ + } \ + if (total_mem > tmem) { \ + size_t used_mem = total_mem-tmem; \ + printf(", %.2f bytes/op", (double)used_mem/N); \ + } \ + if (total_allocs > tallocs) { \ + size_t used_allocs = total_allocs-tallocs; \ + printf(", %.2f allocs/op", (double)used_allocs/N); \ + } \ + printf("\n"); \ +}} + +static void benchmarks(void) { + int seed = getenv("SEED")?atoi(getenv("SEED")):time(NULL); + int N = getenv("N")?atoi(getenv("N")):5000000; + printf("seed=%d, count=%d, item_size=%zu\n", seed, N, sizeof(int)); + srand(seed); + + + int *vals = xmalloc(N * sizeof(int)); + for (int i = 0; i < N; i++) { + vals[i] = i; + } + + shuffle(vals, N, sizeof(int)); + + struct hashmap *map; + shuffle(vals, N, sizeof(int)); + + map = hashmap_new(sizeof(int), 0, seed, seed, hash_int, compare_ints_udata, + NULL, NULL); + bench("set", N, { + const int *v = hashmap_set(map, &vals[i]); + assert(!v); + }) + shuffle(vals, N, sizeof(int)); + bench("get", N, { + const int *v = hashmap_get(map, &vals[i]); + assert(v && *v == vals[i]); + }) + shuffle(vals, N, sizeof(int)); + bench("delete", N, { + const int *v = hashmap_delete(map, &vals[i]); + assert(v && *v == vals[i]); + }) + hashmap_free(map); + + map = hashmap_new(sizeof(int), N, seed, seed, hash_int, compare_ints_udata, + NULL, NULL); + bench("set (cap)", N, { + const int *v = hashmap_set(map, &vals[i]); + assert(!v); + }) + shuffle(vals, N, sizeof(int)); + bench("get (cap)", N, { + const int *v = hashmap_get(map, &vals[i]); + assert(v && *v == vals[i]); + }) + shuffle(vals, N, sizeof(int)); + bench("delete (cap)" , N, { + 
const int *v = hashmap_delete(map, &vals[i]); + assert(v && *v == vals[i]); + }) + + hashmap_free(map); + + + xfree(vals); + + if (total_allocs != 0) { + fprintf(stderr, "total_allocs: expected 0, got %lu\n", total_allocs); + exit(1); + } +} + +int main(void) { + hashmap_set_allocator(xmalloc, xfree); + + if (getenv("BENCH")) { + printf("Running hashmap.c benchmarks...\n"); + benchmarks(); + } else { + printf("Running hashmap.c tests...\n"); + all(); + printf("PASSED\n"); + } +} + + +#endif + + + diff --git a/gumbo-parser/src/hashmap.h b/gumbo-parser/src/hashmap.h new file mode 100644 index 0000000000..e22990e045 --- /dev/null +++ b/gumbo-parser/src/hashmap.h @@ -0,0 +1,62 @@ +// Copyright 2020 Joshua J Baker. All rights reserved. +// Use of this source code is governed by an MIT-style +// license that can be found in the LICENSE file. + +#ifndef HASHMAP_H +#define HASHMAP_H + +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif // __cplusplus + +struct hashmap; + +struct hashmap *hashmap_new(size_t elsize, size_t cap, uint64_t seed0, + uint64_t seed1, + uint64_t (*hash)(const void *item, uint64_t seed0, uint64_t seed1), + int (*compare)(const void *a, const void *b, void *udata), + void (*elfree)(void *item), + void *udata); + +struct hashmap *hashmap_new_with_allocator(void *(*malloc)(size_t), + void *(*realloc)(void *, size_t), void (*free)(void*), size_t elsize, + size_t cap, uint64_t seed0, uint64_t seed1, + uint64_t (*hash)(const void *item, uint64_t seed0, uint64_t seed1), + int (*compare)(const void *a, const void *b, void *udata), + void (*elfree)(void *item), + void *udata); + +void hashmap_free(struct hashmap *map); +void hashmap_clear(struct hashmap *map, bool update_cap); +size_t hashmap_count(struct hashmap *map); +bool hashmap_oom(struct hashmap *map); +const void *hashmap_get(struct hashmap *map, const void *item); +const void *hashmap_set(struct hashmap *map, const void *item); +const void *hashmap_delete(struct hashmap 
*map, const void *item); +const void *hashmap_probe(struct hashmap *map, uint64_t position); +bool hashmap_scan(struct hashmap *map, bool (*iter)(const void *item, void *udata), void *udata); +bool hashmap_iter(struct hashmap *map, size_t *i, void **item); + +uint64_t hashmap_sip(const void *data, size_t len, uint64_t seed0, uint64_t seed1); +uint64_t hashmap_murmur(const void *data, size_t len, uint64_t seed0, uint64_t seed1); +uint64_t hashmap_xxhash3(const void *data, size_t len, uint64_t seed0, uint64_t seed1); + +const void *hashmap_get_with_hash(struct hashmap *map, const void *key, uint64_t hash); +const void *hashmap_delete_with_hash(struct hashmap *map, const void *key, uint64_t hash); +const void *hashmap_set_with_hash(struct hashmap *map, const void *item, uint64_t hash); +void hashmap_set_grow_by_power(struct hashmap *map, size_t power); +void hashmap_set_load_factor(struct hashmap *map, double load_factor); + + +// DEPRECATED: use `hashmap_new_with_allocator` +void hashmap_set_allocator(void *(*malloc)(size_t), void (*free)(void*)); + +#if defined(__cplusplus) +} +#endif // __cplusplus + +#endif // HASHMAP_H diff --git a/gumbo-parser/src/nokogiri_gumbo.h b/gumbo-parser/src/nokogiri_gumbo.h index 69555607ff..cdec31674b 100644 --- a/gumbo-parser/src/nokogiri_gumbo.h +++ b/gumbo-parser/src/nokogiri_gumbo.h @@ -318,21 +318,6 @@ const char* gumbo_normalized_tagname(GumboTag tag); */ void gumbo_tag_from_original_text(GumboStringPiece* text); -/** - * Fixes the case of SVG elements that are not all lowercase. This is - * not done at parse time because there's no place to store a mutated - * tag name. `tag_name` is an enum (which will be `TAG_UNKNOWN` for most - * SVG tags without special handling), while `original_tag_name` is a - * pointer into the original buffer. Instead, we provide this helper - * function that clients can use to rename SVG tags as appropriate. 
- * Returns the case-normalized SVG tagname if a replacement is found, or - * `NULL` if no normalization is called for. The return value is static - * data and owned by the library. - * - * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign - */ -const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname); - /** * Converts a tag name string (which may be in upper or mixed case) to a * tag enum. diff --git a/gumbo-parser/src/parser.c b/gumbo-parser/src/parser.c index 96e6f2bdbe..6498b4850c 100644 --- a/gumbo-parser/src/parser.c +++ b/gumbo-parser/src/parser.c @@ -1962,14 +1962,6 @@ static void merge_attributes ( #endif } -const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) { - const StringReplacement *replacement = gumbo_get_svg_tag_replacement ( - tag->data, - tag->length - ); - return replacement ? replacement->to : NULL; -} - // https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes // This destructively modifies any matching attributes on the token and sets the // namespace appropriately. 
diff --git a/gumbo-parser/src/string_set.c b/gumbo-parser/src/string_set.c new file mode 100644 index 0000000000..e296a02c46 --- /dev/null +++ b/gumbo-parser/src/string_set.c @@ -0,0 +1,41 @@ +#include "string_set.h" + +#include <string.h> +#include "hashmap.h" + +#define SEED0 0xf00ba2 +#define SEED1 0xfa1afe1 + +static int +string_compare(const void *a, const void *b, void *udata) { + return strcmp(*(const char *const *)a, *(const char *const *)b); // each element is a stored `const char *` +} + +static uint64_t +string_hash(const void *item, uint64_t seed0, uint64_t seed1) { + const char *str = *(const char *const *)item; // dereference the stored pointer + return hashmap_xxhash3(str, strlen(str), seed0, seed1); +} + +GumboStringSet * +gumbo_string_set_new(size_t cap) +{ + return hashmap_new(sizeof(char *), cap, SEED0, SEED1, string_hash, string_compare, NULL, NULL); +} + +void gumbo_string_set_free(GumboStringSet *set) +{ + hashmap_free(set); +} + +void +gumbo_string_set_insert(GumboStringSet *set, const char *str) +{ + hashmap_set(set, &str); // the map copies sizeof(char *) bytes, so hand it the pointer's address; `str` must outlive the set +} + +int +gumbo_string_set_contains(GumboStringSet *set, const char *str) +{ + return hashmap_get(set, &str) == NULL ?
0 : 1; +} diff --git a/gumbo-parser/src/string_set.h b/gumbo-parser/src/string_set.h new file mode 100644 index 0000000000..8fd544d17f --- /dev/null +++ b/gumbo-parser/src/string_set.h @@ -0,0 +1,21 @@ +#ifndef STRING_SET_H +#define STRING_SET_H + +#include + +#if defined(__cplusplus) +extern "C" { +#endif // __cplusplus + +typedef struct hashmap GumboStringSet; + +GumboStringSet* gumbo_string_set_new(size_t cap); +void gumbo_string_set_free(GumboStringSet *set); +void gumbo_string_set_insert(GumboStringSet *set, const char *str); +int gumbo_string_set_contains(GumboStringSet *set, const char *str); + +#if defined(__cplusplus) +} +#endif // __cplusplus + +#endif // STRING_SET_H diff --git a/gumbo-parser/src/tokenizer.c b/gumbo-parser/src/tokenizer.c index bc34181351..e82dd873a4 100644 --- a/gumbo-parser/src/tokenizer.c +++ b/gumbo-parser/src/tokenizer.c @@ -58,6 +58,10 @@ #include "utf8.h" #include "util.h" #include "vector.h" +#include "string_set.h" + +// Tuned this based on benchmark in https://github.com/sparklemotion/nokogiri/issues/2568 +#define GUMBO_ATTRIBUTES_LOOKUP_MIN_SIZE 16 // Compared against _temporary_buffer to determine if we're in // double-escaped script mode. @@ -99,6 +103,7 @@ typedef struct GumboInternalTagState { // attributes are added as soon as their attribute name state is complete, and // values are filled in by operating on _attributes.data[attributes.length-1]. GumboVector /* GumboAttribute */ _attributes; + GumboStringSet* _attributes_lookup; // If true, the next attribute value to be finished should be dropped. This // happens if a duplicate attribute name is encountered - we want to consume @@ -440,11 +445,9 @@ static StateResult emit_doctype(GumboParser* parser, GumboToken* output) { return EMIT_TOKEN; } -// Debug-only function that explicitly sets the attribute vector data to NULL so -// it can be asserted on tag creation, verifying that there are no memory leaks. 
static void mark_tag_state_as_empty(GumboTagState* tag_state) { - UNUSED_IF_NDEBUG(tag_state); tag_state->_name = NULL; + tag_state->_attributes_lookup = NULL; #ifndef NDEBUG tag_state->_attributes = kGumboEmptyVector; #endif @@ -461,6 +464,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) { output->v.start_tag.attributes = tag_state->_attributes; output->v.start_tag.is_self_closing = tag_state->_is_self_closing; tag_state->_last_start_tag = tag_state->_tag; + gumbo_string_set_free(tag_state->_attributes_lookup); mark_tag_state_as_empty(tag_state); gumbo_debug( "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag)); @@ -480,6 +484,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) { gumbo_destroy_attribute(tag_state->_attributes.data[i]); } gumbo_free(tag_state->_attributes.data); + gumbo_string_set_free(tag_state->_attributes_lookup); mark_tag_state_as_empty(tag_state); gumbo_debug( "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag)); @@ -508,6 +513,7 @@ static void abandon_current_tag(GumboParser* parser) { } gumbo_free(tag_state->_name); gumbo_free(tag_state->_attributes.data); + gumbo_string_set_free(tag_state->_attributes_lookup); mark_tag_state_as_empty(tag_state); gumbo_string_buffer_destroy(&tag_state->_buffer); gumbo_debug("Abandoning current tag.\n"); @@ -786,6 +792,8 @@ static void finish_attribute_name(GumboParser* parser) { GumboTokenizerState* tokenizer = parser->_tokenizer_state; GumboTagState* tag_state = &tokenizer->_tag_state; GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes; + GumboStringSet* attributes_lookup = tag_state->_attributes_lookup; + char* attr_name = NULL; int max_attributes = parser->_options->max_attributes; if (unlikely(max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes)) { @@ -796,32 +804,42 @@ static void finish_attribute_name(GumboParser* parser) { return; } + if (attributes->length >= 
GUMBO_ATTRIBUTES_LOOKUP_MIN_SIZE && tag_state->_attributes_lookup == NULL) { + // build the hash table of attributes + attributes_lookup = tag_state->_attributes_lookup = gumbo_string_set_new(GUMBO_ATTRIBUTES_LOOKUP_MIN_SIZE * 2); + for (unsigned int i = 0; i < attributes->length; ++i) { + GumboAttribute* attr = attributes->data[i]; + gumbo_string_set_insert(attributes_lookup, attr->name); + } + } + // May've been set by a previous attribute without a value; reset it here. tag_state->_drop_next_attr_value = false; assert(tag_state->_attributes.data); assert(tag_state->_attributes.capacity); - for (unsigned int i = 0; i < attributes->length; ++i) { - GumboAttribute* attr = attributes->data[i]; - if ( - strlen(attr->name) == tag_state->_buffer.length - && 0 == memcmp ( - attr->name, - tag_state->_buffer.data, - tag_state->_buffer.length - ) - ) { - // Identical attribute; bail. - add_duplicate_attr_error(parser); - reinitialize_tag_buffer(parser); - tag_state->_drop_next_attr_value = true; - return; + if (!attributes_lookup) { + for (unsigned int i = 0; i < attributes->length; ++i) { + GumboAttribute* attr = attributes->data[i]; + if (strlen(attr->name) == tag_state->_buffer.length + && 0 == memcmp(attr->name, tag_state->_buffer.data, tag_state->_buffer.length)) { + goto duplicate_attribute; + } + } + } else { + attr_name = gumbo_string_buffer_to_string(&tag_state->_buffer); + if (gumbo_string_set_contains(attributes_lookup, attr_name)) { + goto duplicate_attribute; } } GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute)); attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE; - copy_over_tag_buffer(parser, &attr->name); + if (attr_name) { + attr->name = attr_name; + } else { + copy_over_tag_buffer(parser, &attr->name); + } copy_over_original_tag_text ( parser, &attr->original_name, @@ -836,7 +854,19 @@ static void finish_attribute_name(GumboParser* parser) { &attr->name_end ); gumbo_vector_add(attr, attributes); + if (attributes_lookup) { + 
gumbo_string_set_insert(attributes_lookup, attr->name); + } + reinitialize_tag_buffer(parser); + return; + +duplicate_attribute: + // Identical attribute; bail. + gumbo_free(attr_name); + add_duplicate_attr_error(parser); reinitialize_tag_buffer(parser); + tag_state->_drop_next_attr_value = true; + return; } // Finishes an attribute value. This sets the value of the most recently added diff --git a/gumbo-parser/test/tokenizer.cc b/gumbo-parser/test/tokenizer.cc index 9cde694cf0..139dda10f2 100644 --- a/gumbo-parser/test/tokenizer.cc +++ b/gumbo-parser/test/tokenizer.cc @@ -4684,6 +4684,42 @@ TEST_F(GumboTokenizerTest, Data_MultipleAttributes) { NextChar('z'); } +TEST_F(GumboTokenizerTest, Data_DuplicateAttributes) { + SetInput(""); + NextStartTag(GUMBO_TAG_SPAN, true); + + Error(GUMBO_ERR_DUPLICATE_ATTRIBUTE); + ASSERT_EQ(3, token_.v.start_tag.attributes.length); + + GumboAttribute *attr = static_cast(token_.v.start_tag.attributes.data[0]); + EXPECT_STREQ("foo", attr->name); + EXPECT_STREQ("123", attr->value); + + attr = static_cast(token_.v.start_tag.attributes.data[1]); + EXPECT_STREQ("bar", attr->name); + EXPECT_STREQ("456", attr->value); + + attr = static_cast(token_.v.start_tag.attributes.data[2]); + EXPECT_STREQ("baz", attr->name); + EXPECT_STREQ("abc", attr->value); +} + +TEST_F(GumboTokenizerTest, Data_DuplicateAttributesWithHashtable) { + SetInput(""); + NextStartTag(GUMBO_TAG_SPAN, true); + + Error(GUMBO_ERR_DUPLICATE_ATTRIBUTE); + ASSERT_EQ(26, token_.v.start_tag.attributes.length); + + GumboAttribute *attr = static_cast(token_.v.start_tag.attributes.data[0]); + EXPECT_STREQ("a", attr->name); + EXPECT_STREQ("1", attr->value); + + attr = static_cast(token_.v.start_tag.attributes.data[25]); + EXPECT_STREQ("z", attr->name); + EXPECT_STREQ("1", attr->value); +} + TEST_F(GumboTokenizerTest, Data_LT_Alpha_Slash_GT) { SetInput("
    z"); NextStartTag(GUMBO_TAG_BR); diff --git a/lib/nokogiri/css/xpath_visitor.rb b/lib/nokogiri/css/xpath_visitor.rb index dd228cf070..4032d814ed 100644 --- a/lib/nokogiri/css/xpath_visitor.rb +++ b/lib/nokogiri/css/xpath_visitor.rb @@ -283,7 +283,8 @@ def visit_element_name(node) else node.value.join(":") end - elsif @namespaces&.key?("xmlns") # apply the default namespace if it's declared + elsif node.value.first != "*" && @namespaces&.key?("xmlns") + # apply the default namespace (if one is present) to a non-wildcard selector "xmlns:#{node.value.first}" else node.value.first diff --git a/lib/nokogiri/extension.rb b/lib/nokogiri/extension.rb index 47492bc786..f4ff04abd0 100644 --- a/lib/nokogiri/extension.rb +++ b/lib/nokogiri/extension.rb @@ -16,8 +16,8 @@ If that's the case, then please install Nokogiri via the `ruby` platform gem: gem install nokogiri --platform=ruby - or: - bundle config set force_ruby_platform true + or, in your Gemfile: + gem "nokogiri", force_ruby_platform: true Please visit https://nokogiri.org/tutorials/installing_nokogiri.html for more help. diff --git a/lib/nokogiri/html4/element_description_defaults.rb b/lib/nokogiri/html4/element_description_defaults.rb index 79a8b81064..948323b5f4 100644 --- a/lib/nokogiri/html4/element_description_defaults.rb +++ b/lib/nokogiri/html4/element_description_defaults.rb @@ -9,7 +9,6 @@ class ElementDescription # defined there. Desc = Struct.new( - "HTMLElementDescription", :name, :startTag, :endTag, diff --git a/lib/nokogiri/html4/encoding_reader.rb b/lib/nokogiri/html4/encoding_reader.rb index ce8069adcf..aec2676332 100644 --- a/lib/nokogiri/html4/encoding_reader.rb +++ b/lib/nokogiri/html4/encoding_reader.rb @@ -108,7 +108,7 @@ def read(len) ret = @firstchunk.slice!(0, len) if (len -= ret.length) > 0 - (rest = @io.read(len)) && ret << (rest) + (rest = @io.read(len)) && ret << rest end if ret.empty? 
nil diff --git a/lib/nokogiri/version/constant.rb b/lib/nokogiri/version/constant.rb index 8a6fe62f17..11b5a4b83d 100644 --- a/lib/nokogiri/version/constant.rb +++ b/lib/nokogiri/version/constant.rb @@ -2,5 +2,5 @@ module Nokogiri # The version of Nokogiri you are using - VERSION = "1.18.0" + VERSION = "1.19.0.dev" end diff --git a/lib/nokogiri/xml/builder.rb b/lib/nokogiri/xml/builder.rb index 617ef45e73..eb05f22135 100644 --- a/lib/nokogiri/xml/builder.rb +++ b/lib/nokogiri/xml/builder.rb @@ -200,7 +200,7 @@ module XML # === Namespace inheritance # # In the Builder context, children will inherit their parent's namespace. This is the same - # behavior as if the underlying {XML::Document} set +namespace_inheritance+ to +true+: + # behavior as if the underlying XML::Document set +namespace_inheritance+ to +true+: # # result = Nokogiri::XML::Builder.new do |xml| # xml["soapenv"].Envelope("xmlns:soapenv" => "http://schemas.xmlsoap.org/soap/envelope/") do @@ -229,7 +229,7 @@ module XML # # # # # - # For more information on namespace inheritance, please see {XML::Document#namespace_inheritance} + # For more information on namespace inheritance, please see XML::Document#namespace_inheritance # # # == Document Types @@ -314,7 +314,7 @@ def initialize(options = {}, root = nil, &block) @context = nil @arity = nil - @ns = nil + @ns_prefix = nil options = DEFAULT_DOCUMENT_OPTIONS.merge(options) options.each do |k, v| @@ -355,20 +355,8 @@ def comment(string) ### # Build a tag that is associated with namespace +ns+. Raises an # ArgumentError if +ns+ has not been defined higher in the tree. 
- def [](ns) - if @parent != @doc - @ns = @parent.namespace_definitions.find { |x| x.prefix == ns.to_s } - end - return self if @ns - - @parent.ancestors.each do |a| - next if a == doc - - @ns = a.namespace_definitions.find { |x| x.prefix == ns.to_s } - return self if @ns - end - - @ns = { pending: ns.to_s } + def [](ns_prefix) + @ns_prefix = ns_prefix.to_s self end @@ -395,29 +383,33 @@ def method_missing(method, *args, &block) # :nodoc: if @context&.respond_to?(method) @context.send(method, *args, &block) else - node = @doc.create_element(method.to_s.sub(/[_!]$/, ""), *args) do |n| - # Set up the namespace - if @ns.is_a?(Nokogiri::XML::Namespace) - n.namespace = @ns - @ns = nil - end - end - - if @ns.is_a?(Hash) - node.namespace = node.namespace_definitions.find { |x| x.prefix == @ns[:pending] } - if node.namespace.nil? - raise ArgumentError, "Namespace #{@ns[:pending]} has not been defined" - end - - @ns = nil - end - + node = @doc.create_element(method.to_s.sub(/[_!]$/, ""), *args) + bind_ns(node) insert(node, &block) end end private + def bind_ns(node) + return if @ns_prefix.nil? + + ancestors = [node, parent, parent.ancestors].flatten + ancestors.each do |ancestor| + break if ancestor.nil? || ancestor == @doc + + if (ns = ancestor.namespace_definitions.find { |x| x.prefix == @ns_prefix }) + @ns_prefix = nil + node.namespace = ns + break + end + end + + return if @ns_prefix.nil? + + raise ArgumentError, "Namespace prefix #{@ns_prefix.inspect} has not been defined" + end + ### # Insert +node+ as a child of the current Node def insert(node, &block) diff --git a/lib/nokogiri/xml/document.rb b/lib/nokogiri/xml/document.rb index 6c9d4949a1..e4446f3d30 100644 --- a/lib/nokogiri/xml/document.rb +++ b/lib/nokogiri/xml/document.rb @@ -435,7 +435,7 @@ def fragment(tags = nil) undef_method :namespace_definitions, :line, :add_namespace def add_child(node_or_tags) - raise "A document may not have multiple root nodes." 
if (root && root.name != "nokogiri_text_wrapper") && !(node_or_tags.comment? || node_or_tags.processing_instruction?) + raise "A document may not have multiple root nodes." if root && root.name != "nokogiri_text_wrapper" && !(node_or_tags.comment? || node_or_tags.processing_instruction?) node_or_tags = coerce(node_or_tags) if node_or_tags.is_a?(XML::NodeSet) @@ -502,10 +502,10 @@ def deconstruct_keys(keys) { root: root } end - private - IMPLIED_XPATH_CONTEXTS = ["//"].freeze # :nodoc: + private + def inspect_attributes [:name, :children] end diff --git a/lib/nokogiri/xml/node.rb b/lib/nokogiri/xml/node.rb index 2c9d7f12d2..02e802d4df 100644 --- a/lib/nokogiri/xml/node.rb +++ b/lib/nokogiri/xml/node.rb @@ -5,9 +5,41 @@ module Nokogiri module XML - # Nokogiri::XML::Node is the primary API you'll use to interact with your Document. + # \Class XML::Node defines many methods and constants + # that are important for working with an \XML document. # - # == Attributes + # XML::Node itself includes modules that add still more methods and constants: + # + # - Nokogiri::XML::Searchable + # - Ruby core's {Enumerable}[https://docs.ruby-lang.org/en/master/Enumerable.html] + # + # == Node Hierarchy + # + # Each of the following classes is, directly or indirectly, a subclass of XML::Node, + # and so inherits all the methods and constants mentioned above: + # + # - Nokogiri::XML::Attr + # - Nokogiri::XML::AttributeDecl + # - Nokogiri::XML::CharacterData + # - Nokogiri::XML::Comment + # - Nokogiri::XML::Text + # - Nokogiri::XML::CDATA + # - Nokogiri::XML::DTD + # - Nokogiri::XML::Document + # - Nokogiri::XML::DocumentFragment + # - Nokogiri::XML::Element + # - Nokogiri::XML::ElementDecl + # - Nokogiri::XML::EntityDecl + # - Nokogiri::XML::EntityReference + # - Nokogiri::XML::ProcessingInstruction + # + # == About the Examples + # + # Examples on this page may assume that certain setup code has been executed. 
+ # + # :include: doc/examples/bookstore_setup.rb + # + # # == Attributes # # A Nokogiri::XML::Node may be treated similarly to a hash with regard to attributes. For # example: @@ -171,16 +203,109 @@ def decorate! # :section: Manipulating Document Structure - ### - # Add +node_or_tags+ as a child of this Node. + # :call-seq: + # add_child(object) -> Node or NodeSet # - # +node_or_tags+ can be a Nokogiri::XML::Node, a ::DocumentFragment, a ::NodeSet, or a String - # containing markup. + # Appends specified Nodes to the children of +self+; + # each appended Node has +self+ as its #parent value, + # and self.document as its #document value. # - # Returns the reparented node (if +node_or_tags+ is a Node), or NodeSet (if +node_or_tags+ is - # a DocumentFragment, NodeSet, or String). + # [Arguments] + # + # - +object+ (Node, NodeSet, DocumentFragment, String): + # Specifies the Node objects to be appended; + # the Nodes may be in the same Document or DocumentFragment as +self+, + # or in a different one. + # + # [Returns] + # + # - The given +object+, if +object+ is a Node or a NodeSet + # - A new NodeSet, if +object+ is a DocumentFragment or String. + # + # When +object+ is a Node, + # moves it to become the last child of +self+; + # returns +object+: + # + # src_xml = '' + # src_doc = Nokogiri::XML::Document.parse(src_xml) + # src_parent_node = src_doc.at_xpath('//src_parent') + # dst_xml = '' + # dst_doc = Nokogiri::XML::Document.parse(dst_xml) + # dst_parent_node = dst_doc.root + # node_to_move = src_doc.at_xpath('//src_child') + # # Before. + # src_parent_node.children.map {|child| child.name } # => ["src_child"] + # dst_parent_node.children.map {|child| child.name } # => ["dst_parent"] + # node_to_move.parent.name # => "src_parent" + # # Move the node. + # dst_parent_node.add_child(node_to_move) + # # After. 
+ # src_parent_node.children.map {|child| child.name } # => [] + # dst_parent_node.children.map {|child| child.name } # => ["dst_parent", "src_child"] + # node_to_move.parent.name # => "dst_root" + # + # When +object+ is a NodeSet, + # appends each of its nodes to the children of +self+; + # returns +object+: + # + # src_xml = '' + # src_doc = Nokogiri::XML::Document.parse(src_xml) + # nodeset_to_move = src_doc.root.children + # nodeset_to_move.class # => Nokogiri::XML::NodeSet + # dst_doc = Nokogiri::XML::Document.parse('') + # dst_node = dst_doc.root + # # Before. + # nodeset_to_move.map {|node| node.name } # => ["foo", "bar"] + # nodeset_to_move.map {|node| node.parent.name } # => ["src_root", "src_root"] + # dst_node.children.map {|child| child.name } # => ["baz"] + # # Move the nodeset. + # dst_node.add_child(nodeset_to_move) + # # After. + # nodeset_to_move.map {|node| node.name } # => ["foo", "bar"] + # nodeset_to_move.map {|node| node.parent.name } # => ["dst_root", "dst_root"] + # dst_node.children.map {|child| child.name } # => ["baz", "foo", "bar"] + # + # When +object+ is a DocumentFragment, + # creates a NodeSet object from the DocumentFragment; + # appends each of its nodes to the children of +self+; + # returns the NodeSet: + # + # src_xml = '' + # src_frag = Nokogiri::XML::DocumentFragment.parse(src_xml) + # dst_xml = '' + # dst_doc = Nokogiri::XML::Document.parse(dst_xml) + # dst_node = dst_doc.root + # # Before. + # src_frag.children.map {|child| child.name } # => ["foo", "bar"] + # src_frag.children.map {|child| child.parent.name } # => ["#document-fragment", "#document-fragment"] + # dst_node.children.map {|child| child.name } # => ["baz"] + # # Move the fragment. + # dst_node.add_child(src_frag) + # # After. 
+ # src_frag.children.map {|child| child.name } # => [] + # dst_node.children.map {|child| child.name } # => ["baz", "foo", "bar"] + # dst_node.children.map {|child| child.parent.name } # => ["dst_root", "dst_root", "dst_root"] + # + # When +object+ is a String, + # creates a NodeSet object from the string; + # appends each of its nodes to the children of +self+; + # returns the NodeSet: + # + # src_xml = '' + # dst_xml = '' + # dst_doc = Nokogiri::XML::Document.parse(dst_xml) + # dst_node = dst_doc.root + # # Before. + # dst_node.children.map {|child| child.name } # => ["baz"] + # dst_node.children.map {|child| child.parent.name } # => ["dst_root"] + # # Add the NodeSet created from src_xml. + # dst_node.add_child(src_xml) + # # After. + # dst_node.children.map {|child| child.name } # => ["baz", "foo", "bar"] + # dst_node.children.map {|child| child.parent.name } # => ["dst_root", "dst_root", "dst_root"] + # + # Related: #<<, #after, #before, #children=, #prepend_child. # - # Also see related method +<<+. def add_child(node_or_tags) node_or_tags = coerce(node_or_tags) if node_or_tags.is_a?(XML::NodeSet) @@ -1313,7 +1438,7 @@ def ancestors(selector = nil) end #### - # Yields self and all children to +block+ recursively. + # Yields all children to +block+ recursively, then yields self. 
def traverse(&block) children.each { |j| j.traverse(&block) } yield(self) @@ -1584,6 +1709,8 @@ def coerce(data) EOERR end + IMPLIED_XPATH_CONTEXTS = [".//"].freeze # :nodoc: + private def keywordify(keywords) @@ -1634,8 +1761,6 @@ def inspect_attributes [:name, :namespace, :attribute_nodes, :children] end - IMPLIED_XPATH_CONTEXTS = [".//"].freeze - def add_child_node_and_reparent_attrs(node) add_child_node(node) node.attribute_nodes.find_all { |a| a.name.include?(":") }.each do |attr_node| diff --git a/lib/nokogiri/xml/parse_options.rb b/lib/nokogiri/xml/parse_options.rb index ddb7c79891..bdf069e429 100644 --- a/lib/nokogiri/xml/parse_options.rb +++ b/lib/nokogiri/xml/parse_options.rb @@ -1,87 +1,265 @@ # coding: utf-8 # frozen_string_literal: true +# :markup: markdown + module Nokogiri module XML - # Options that control the parsing behavior for XML::Document, XML::DocumentFragment, - # HTML4::Document, HTML4::DocumentFragment, XSLT::Stylesheet, and XML::Schema. + # \Class to contain options for parsing \XML or \HTML4 (but not \HTML5). + # + # 💡 Note that \HTML5 parsing has a separate, orthogonal set of options due to the API of the + # \HTML5 library used. See Nokogiri::HTML5. + # + # ## About the Examples + # + # Examples on this page assume that the following code has been executed: + # + # ``` + # require 'nokogiri' # Make Nokogiri available. + # include Nokogiri # Allow omitting leading 'Nokogiri::'. + # xml_s = "\n" # String containing XML. + # File.write('t.xml', xml_s) # File containing XML. + # html_s = "\n" # String containing HTML. + # File.write('t.html', html_s) # File containing HTML. + # ``` + # + # Examples executed via `IRB` (interactive Ruby) display \ParseOptions instances + # using method #inspect. 
+ # + # ## Parsing Methods + # + # Each of the parsing methods performs parsing for an \XML or \HTML4 source: + # + # - Each requires a leading argument that specifies the source of the text to be parsed; + # except as noted, the argument's value may be either: # - # These options directly expose libxml2's parse options, which are all boolean in the sense that - # an option is "on" or "off". + # - A string. + # - An open IO stream (must respond to methods `read` and `close`). # - # 💡 Note that HTML5 parsing has a separate, orthogonal set of options due to the nature of the - # HTML5 specification. See Nokogiri::HTML5. + # Examples: # - # ⚠ Not all parse options are supported on JRuby. Nokogiri will attempt to invoke the equivalent + # ``` + # XML::parse(xml_s) + # HTML4.parse(html_s) + # XML::parse(File.open('t.xml')) + # HTML4.parse(File.open('t.html')) + # ``` + # + # - Each accepts a trailing optional argument `options` + # (or keyword argument `options`) + # that specifies parsing options; + # the argument's value may be either: + # + # - An integer: see [Bitmap Constants](rdoc-ref:ParseOptions@Bitmap+Constants). + # - An instance of \ParseOptions: see ParseOptions.new. + # + # Examples: + # + # ``` + # XML::parse(xml_s, options: XML::ParseOptions::STRICT) + # HTML4::parse(html_s, options: XML::ParseOptions::BIG_LINES) + # XML::parse(xml_s, options: XML::ParseOptions.new.strict) + # HTML4::parse(html_s, options: XML::ParseOptions.new.big_lines) + # ``` + # + # - Each (except as noted) accepts a block that allows parsing options to be specified; + # see [Options-Setting Blocks](rdoc-ref:ParseOptions@Options-Setting+Blocks). + # + # Certain other parsing methods use different options; + # see \HTML5. + # + # ⚠ Not all parse options are supported on JRuby. + # \Nokogiri attempts to invoke the equivalent # behavior in Xerces/NekoHTML on JRuby when it's possible. 
# - # == Setting and unsetting parse options + # ## Bitmap Constants # - # You can build your own combinations of parse options by using any of the following methods: + # Each of the [parsing methods](rdoc-ref:ParseOptions@Parsing+Methods) + # discussed here accepts an integer argument `options` that specifies parsing options. # - # [ParseOptions method chaining] + # That integer value may be constructed using the bitmap constants defined in \ParseOptions. # - # Every option has an equivalent method in lowercase. You can chain these methods together to - # set various combinations. + # Except for `STRICT` (see note below), + # each of the bitmap constants has a non-zero value + # that represents a bit in an integer value; + # to illustrate, here are a few of the constants, displayed in binary format (base 2): # - # # Set the HUGE & PEDANTIC options - # po = Nokogiri::XML::ParseOptions.new.huge.pedantic - # doc = Nokogiri::XML::Document.parse(xml, nil, nil, po) + # ``` + # ParseOptions::RECOVER.to_s(2) # => "1" + # ParseOptions::NOENT.to_s(2) # => "10" + # ParseOptions::DTDLOAD.to_s(2) # => "100" + # ParseOptions::DTDATTR.to_s(2) # => "1000" + # ParseOptions::DTDVALID.to_s(2) # => "10000" + # ``` # - # Every option has an equivalent no{option} method in lowercase. You can call these - # methods on an instance of ParseOptions to unset the option.
+ # Any of these constants may be used alone to specify a single option: # - # # Set the HUGE & PEDANTIC options - # po = Nokogiri::XML::ParseOptions.new.huge.pedantic + # ``` + # ParseOptions.new(ParseOptions::DTDLOAD) + # # => # + # ParseOptions.new(ParseOptions::DTDATTR) + # # => # + # ``` # - # # later we want to modify the options - # po.nohuge # Unset the HUGE option - # po.nopedantic # Unset the PEDANTIC option + # Multiple constants may be ORed together to specify multiple options: # - # 💡 Note that some options begin with "no" leading to the logical but perhaps unintuitive - # double negative: + # ``` + # options = ParseOptions::BIG_LINES | ParseOptions::COMPACT | ParseOptions::NOCDATA + # ParseOptions.new(options) + # # => # + # ``` # - # po.nocdata # Set the NOCDATA parse option - # po.nonocdata # Unset the NOCDATA parse option + # **Note**: + # The value of constant `STRICT` is zero; + # it may be used alone to turn all options **off**: # - # 💡 Note that negation is not available for STRICT, which is itself a negation of all other - # features. + # ``` + # XML.parse('') {|options| puts options.inspect } + # # + # XML.parse('', nil, nil, ParseOptions::STRICT) {|options| puts options.inspect } + # # + # ``` # + # The single-option bitmask constants are: + # BIG_LINES, + # COMPACT, + # DTDATTR, + # DTDLOAD, + # DTDVALID, + # HUGE, + # NOBASEFIX, + # NOBLANKS, + # NOCDATA, + # NOENT, + # NOERROR, + # NONET, + # NOWARNING, + # NOXINCNODE, + # NSCLEAN, + # OLD10, + # PEDANTIC, + # RECOVER, + # STRICT, + # XINCLUDE. # - # [Using Ruby Blocks] + # There are also several "shorthand" constants that can set multiple options: + # DEFAULT_HTML, + # DEFAULT_SCHEMA, + # DEFAULT_XML, + # DEFAULT_XSLT. 
# - # Most parsing methods will accept a block for configuration of parse options, and we - # recommend chaining the setter methods: + # Examples: # - # doc = Nokogiri::XML::Document.parse(xml) { |config| config.huge.pedantic } + # ``` + # ParseOptions.new(ParseOptions::DEFAULT_HTML) + # # => # + # ParseOptions.new(ParseOptions::DEFAULT_SCHEMA) + # # => # + # ParseOptions.new(ParseOptions::DEFAULT_XML) + # # => # + # ParseOptions.new(ParseOptions::DEFAULT_XSLT) + # # => # # + # ``` # + # \Nokogiri itself uses these shorthand constants for its parsing, + # and they are generally most suitable for \Nokogiri users' code. # - # [ParseOptions constants] + # ## Options-Setting Blocks + # + # Many of the [parsing methods](rdoc-ref:ParseOptions@Parsing+Methods) + # discussed here accept an options-setting block. + # + # The block is called with a new instance of \ParseOptions + # created with the defaults for the specific method: + # + # ``` + # XML::parse(xml_s) {|options| puts options.inspect } + # # + # HTML4::parse(html_s) {|options| puts options.inspect } + # # + # ``` + # + # When the block returns, the parsing is performed using those `options`. + # + # The block may modify those options, which affects parsing: + # + # ``` + # bad_xml = '' # End tag missing. + # XML::parse(bad_xml) # No error because option RECOVER is on. + # XML::parse(bad_xml) {|options| options.strict } # Raises SyntaxError because option STRICT is on. + # ``` + # + # ## Convenience Methods + # + # A \ParseOptions object has three sets of convenience methods, + # each based on the name of one of the constants: + # + # - **Setters**: each is the downcase of an option name, and turns **on** an option: + # + # ``` + # options = ParseOptions.new + # # => # + # options.big_lines + # # => # + # options.compact + # # => # + # ``` + # + # - **Unsetters**: each begins with `no`, and turns **off** an option. 
+ # + # Note that there is no unsetter `nostrict`, + # but the setter `recover` serves the same purpose: + # + # ``` + # options.nobig_lines + # # => # + # options.nocompact + # # => # + # options.recover # Functionally equivalent to nostrict. + # # => # + # options.noent # Set NOENT. + # # => # + # options.nonoent # Unset NOENT. + # # => # + # ``` + # + # 💡 Note that some options begin with `no`, leading to the logical but perhaps unintuitive + # double negative: + # + # ``` + # po.nocdata # Set the NOCDATA parse option + # po.nonocdata # Unset the NOCDATA parse option + # ``` # - # You can also use the constants declared under Nokogiri::XML::ParseOptions to set various - # combinations. They are bits in a bitmask, and so can be combined with bitwise operators: + # - **Queries**: each ends with `?`, and returns whether an option is **on** or **off**: # - # po = Nokogiri::XML::ParseOptions.new(Nokogiri::XML::ParseOptions::HUGE | Nokogiri::XML::ParseOptions::PEDANTIC) - # doc = Nokogiri::XML::Document.parse(xml, nil, nil, po) + # ``` + # options.recover? # => true + # options.strict? # => false + # ``` + # + # Each setter and unsetter method returns `self`, + # so the methods may be chained: + # + # ``` + # options.compact.big_lines + # # => # + # ``` # class ParseOptions - # Strict parsing + # Strict parsing; do not recover from errors in input. STRICT = 0 - # Recover from errors. On by default for XML::Document, XML::DocumentFragment, - # HTML4::Document, HTML4::DocumentFragment, XSLT::Stylesheet, and XML::Schema. + # Recover from errors in input; no strict parsing. RECOVER = 1 << 0 # Substitute entities. Off by default. - # - # ⚠ This option enables entity substitution, contrary to what the name implies. - # - # ⚠ It is UNSAFE to set this option when parsing untrusted documents. + # ⚠ This option enables entity substitution, contrary to what the name implies. + # ⚠ It is UNSAFE to set this option when parsing untrusted documents. 
NOENT = 1 << 1 # Load external subsets. On by default for XSLT::Stylesheet. - # - # ⚠ It is UNSAFE to set this option when parsing untrusted documents. + # ⚠ It is UNSAFE to set this option when parsing untrusted documents. DTDLOAD = 1 << 2 # Default DTD attributes. On by default for XSLT::Stylesheet. @@ -90,10 +268,10 @@ class ParseOptions # Validate with the DTD. Off by default. DTDVALID = 1 << 4 - # Suppress error reports. On by default for HTML4::Document and HTML4::DocumentFragment + # Suppress error reports. On by default for HTML4::Document and HTML4::DocumentFragment. NOERROR = 1 << 5 - # Suppress warning reports. On by default for HTML4::Document and HTML4::DocumentFragment + # Suppress warning reports. On by default for HTML4::Document and HTML4::DocumentFragment. NOWARNING = 1 << 6 # Enable pedantic error reporting. Off by default. @@ -103,19 +281,19 @@ class ParseOptions NOBLANKS = 1 << 8 # Use the SAX1 interface internally. Off by default. - SAX1 = 1 << 9 + SAX1 = 1 << 9 # :nodoc: # Implement XInclude substitution. Off by default. XINCLUDE = 1 << 10 - # Forbid network access. On by default for XML::Document, XML::DocumentFragment, + # Forbid network access. + # On by default for XML::Document, XML::DocumentFragment, # HTML4::Document, HTML4::DocumentFragment, XSLT::Stylesheet, and XML::Schema. - # - # ⚠ It is UNSAFE to unset this option when parsing untrusted documents. + # ⚠ It is UNSAFE to unset this option when parsing untrusted documents. NONET = 1 << 11 # Do not reuse the context dictionary. Off by default. - NODICT = 1 << 12 + NODICT = 1 << 12 # :nodoc: # Remove redundant namespaces declarations. Off by default. NSCLEAN = 1 << 13 @@ -127,41 +305,85 @@ class ParseOptions NOXINCNODE = 1 << 15 # Compact small text nodes. Off by default. - # - # ⚠ No modification of the DOM tree is allowed after parsing. libxml2 may crash if you try to - # modify the tree. + # ⚠ No modification of the DOM tree is allowed after parsing. 
COMPACT = 1 << 16 - # Parse using XML-1.0 before update 5. Off by default + # Parse using XML-1.0 before update 5. Off by default. OLD10 = 1 << 17 - # Do not fixup XInclude xml:base uris. Off by default + # Do not fixup XInclude xml:base URIs. Off by default. NOBASEFIX = 1 << 18 # Relax any hardcoded limit from the parser. Off by default. - # - # ⚠ It is UNSAFE to set this option when parsing untrusted documents. + # ⚠ It is UNSAFE to set this option when parsing untrusted documents. HUGE = 1 << 19 - # Support line numbers up to long int (default is a short int). On - # by default for for XML::Document, XML::DocumentFragment, HTML4::Document, + # Support line numbers up to `long int` (default is a `short int`). + # On by default for XML::Document, XML::DocumentFragment, HTML4::Document, # HTML4::DocumentFragment, XSLT::Stylesheet, and XML::Schema. BIG_LINES = 1 << 22 - # The options mask used by default for parsing XML::Document and XML::DocumentFragment + # Shorthand options mask useful for parsing XML: + # sets RECOVER, NONET, BIG_LINES. DEFAULT_XML = RECOVER | NONET | BIG_LINES - # The options mask used by default used for parsing XSLT::Stylesheet + # Shorthand options mask useful for parsing XSLT stylesheets: + # sets RECOVER, NONET, NOENT, DTDLOAD, DTDATTR, NOCDATA, BIG_LINES. DEFAULT_XSLT = RECOVER | NONET | NOENT | DTDLOAD | DTDATTR | NOCDATA | BIG_LINES - # The options mask used by default used for parsing HTML4::Document and HTML4::DocumentFragment + # Shorthand options mask useful for parsing HTML4: + # sets RECOVER, NOERROR, NOWARNING, NONET, BIG_LINES. DEFAULT_HTML = RECOVER | NOERROR | NOWARNING | NONET | BIG_LINES - # The options mask used by default used for parsing XML::Schema + # Shorthand options mask useful for parsing \XML schemas: + # sets NONET, BIG_LINES.
DEFAULT_SCHEMA = NONET | BIG_LINES + # Returns or sets and returns the integer value of `self`: + # + # ``` + # options = ParseOptions.new(ParseOptions::DEFAULT_HTML) + # # => # 4196449 + # options.options = ParseOptions::STRICT + # options.options # => 0 + # ``` + # attr_accessor :options + # :markup: markdown + # + # :call-seq: + # ParseOptions.new(options = ParseOptions::STRICT) + # + # Returns a new \ParseOptions object with options as specified by integer argument `options`. + # The value of `options` may be constructed + # using [Bitmap Constants](rdoc-ref:ParseOptions@Bitmap+Constants). + # + # With the simple constant `ParseOptions::STRICT` (the default), all options are **off** + # (`strict` means `norecover`): + # + # ``` + # ParseOptions.new + # # => # + # ``` + # + # With a different simple constant, one option may be set: + # + # ``` + # ParseOptions.new(ParseOptions::RECOVER) + # # => # + # ParseOptions.new(ParseOptions::COMPACT) + # # => # + # ``` + # + # With multiple ORed constants, multiple options may be set: + # + # ``` + # options = ParseOptions::COMPACT | ParseOptions::RECOVER | ParseOptions::BIG_LINES + # ParseOptions.new(options) + # # => # + # ``` def initialize(options = STRICT) @options = options end @@ -186,21 +408,88 @@ def #{constant.downcase}? RUBY end + # :call-seq: + # strict + # + # Turns **off** option `recover`: + # + # ``` + # options = ParseOptions.new.recover.compact.big_lines + # # => # + # options.strict + # # => # + # ``` def strict @options &= ~RECOVER self end + # :call-seq: + # strict? + # + # Returns whether option `strict` is **on**: + # + # ``` + # options = ParseOptions.new.recover.compact.big_lines + # # => # + # options.strict? # => false + # options.strict + # # => # + # options.strict? # => true + # ``` def strict? @options & RECOVER == STRICT end + # :call-seq: + # self == object + # + # Returns true if the same options are set in `self` and `object`. 
+ # + # ``` + # options = ParseOptions.new + # # => # + # options == options.dup # => true + # options == options.dup.recover # => false + # ``` + # def ==(other) other.to_i == to_i end alias_method :to_i, :options + # :call-seq: + # inspect + # + # Returns a string representation of `self` that includes + # the numeric value of `@options`: + # + # ``` + # options = ParseOptions.new + # options.inspect + # # => "#" + # + # ``` + # + # In general, the returned string also includes the (downcased) names of the options + # that are **on** (but omits the names of those that are **off**): + # + # ``` + # options.recover.big_lines + # options.inspect + # # => "#" + # ``` + # + # The exception is that always either `recover` (i.e., *not strict*) + # or the pseudo-option `strict` is reported: + # + # ``` + # options.norecover + # options.inspect + # # => "#" + # ``` + # def inspect options = [] self.class.constants.each do |k| diff --git a/lib/nokogiri/xml/searchable.rb b/lib/nokogiri/xml/searchable.rb index 2985fa63d1..646813644b 100644 --- a/lib/nokogiri/xml/searchable.rb +++ b/lib/nokogiri/xml/searchable.rb @@ -50,6 +50,11 @@ module Searchable # }.new # node.search('.//title[nokogiri:regex(., "\w+")]', 'div.employee:regex("[0-9]+")', handler) # + # ⚠ NOTE that the #search method may not always correctly detect whether the input is a CSS + # selector or an XPath expression; and the heuristic used may change in the future. The authors + # strongly recommend using Searchable#css when you know you're searching with a CSS selector, + # or Searchable#xpath when you know you're searching with an XPath expression. + # # See Searchable#xpath and Searchable#css for further usage help.
def search(*args) paths, handler, ns, binds = extract_params(args) diff --git a/misc/native.yml b/misc/native.yml new file mode 100644 index 0000000000..b8f29dbcbe --- /dev/null +++ b/misc/native.yml @@ -0,0 +1,16 @@ +# configuration file for precompiled native gem packaging +platforms: + - aarch64-linux-gnu + - aarch64-linux-musl + - arm64-darwin + - arm-linux-gnu + - arm-linux-musl + - x64-mingw-ucrt + - x86_64-darwin + - x86_64-linux-gnu + - x86_64-linux-musl +rubies: + - "3.4" + - "3.3" + - "3.2" + - "3.1" diff --git a/nokogiri.gemspec b/nokogiri.gemspec index e5c8b0effa..054305a1d3 100644 --- a/nokogiri.gemspec +++ b/nokogiri.gemspec @@ -59,6 +59,7 @@ Gem::Specification.new do |spec| "README.md", "bin/nokogiri", "dependencies.yml", + "doc/keyword_arguments.md", "ext/java/nokogiri/Html4Document.java", "ext/java/nokogiri/Html4ElementDescription.java", "ext/java/nokogiri/Html4EntityLookup.java", @@ -198,15 +199,19 @@ Gem::Specification.new do |spec| "gumbo-parser/src/error.h", "gumbo-parser/src/foreign_attrs.c", "gumbo-parser/src/foreign_attrs.gperf", - "gumbo-parser/src/nokogiri_gumbo.h", + "gumbo-parser/src/hashmap.c", + "gumbo-parser/src/hashmap.h", "gumbo-parser/src/insertion_mode.h", "gumbo-parser/src/macros.h", + "gumbo-parser/src/nokogiri_gumbo.h", "gumbo-parser/src/parser.c", "gumbo-parser/src/parser.h", "gumbo-parser/src/replacement.h", "gumbo-parser/src/string_buffer.c", "gumbo-parser/src/string_buffer.h", "gumbo-parser/src/string_piece.c", + "gumbo-parser/src/string_set.c", + "gumbo-parser/src/string_set.h", "gumbo-parser/src/svg_attrs.c", "gumbo-parser/src/svg_attrs.gperf", "gumbo-parser/src/svg_tags.c", @@ -328,10 +333,6 @@ Gem::Specification.new do |spec| spec.rdoc_options = ["--main", "README.md"] if java_p - # loosen after jruby fixes https://github.com/jruby/jruby/issues/7262 - # also see https://github.com/mkristian/jar-dependencies/commit/006fb254 - spec.add_development_dependency("jar-dependencies", "= 0.4.1") - spec.require_paths << 
"lib/nokogiri/jruby" # where we install the jars, see the :vendor_jars rake task spec.requirements << "jar isorelax, isorelax, 20030108" # https://search.maven.org/artifact/isorelax/isorelax spec.requirements << "jar org.nokogiri, nekodtd, 0.1.11.noko2" diff --git a/oci-images/nokogiri-test/alpine.dockerfile b/oci-images/nokogiri-test/alpine.dockerfile index 42c3f3cf30..2d33ec18df 100644 --- a/oci-images/nokogiri-test/alpine.dockerfile +++ b/oci-images/nokogiri-test/alpine.dockerfile @@ -7,6 +7,9 @@ RUN apk add bash build-base git # valgrind RUN apk add valgrind +# psych +RUN apk add yaml-dev + # libxml-et-al RUN apk add libxml2-dev libxslt-dev pkgconfig diff --git a/oci-images/nokogiri-test/alpine.erb b/oci-images/nokogiri-test/alpine.erb index 7a8082c611..12e0605b0b 100644 --- a/oci-images/nokogiri-test/alpine.erb +++ b/oci-images/nokogiri-test/alpine.erb @@ -7,6 +7,9 @@ RUN apk add bash build-base git # valgrind RUN apk add valgrind +# psych +RUN apk add yaml-dev + # libxml-et-al RUN apk add libxml2-dev libxslt-dev pkgconfig diff --git a/rakelib/check-manifest.rake b/rakelib/check-manifest.rake index 33f49937da..6bc89567c6 100644 --- a/rakelib/check-manifest.rake +++ b/rakelib/check-manifest.rake @@ -38,7 +38,6 @@ task :check_manifest, [:verbose] do |_, args| [0-9]* } ignore_files = %w[ - .cross_rubies .editorconfig .gitignore .gitmodules diff --git a/rakelib/extensions.rake b/rakelib/extensions.rake index 232291c354..5790211580 100644 --- a/rakelib/extensions.rake +++ b/rakelib/extensions.rake @@ -2,6 +2,8 @@ require "rbconfig" require "shellwords" +require "rake_compiler_dock" +require "yaml" CrossRuby = Struct.new(:version, :platform) do LINUX_PLATFORM_REGEX = /linux/ @@ -166,16 +168,14 @@ CrossRuby = Struct.new(:version, :platform) do end end -CROSS_RUBIES = File.read(".cross_rubies").split("\n").filter_map do |line| - case line - when /\A([^#]+):([^#]+)/ - CrossRuby.new(Regexp.last_match(1), Regexp.last_match(2)) +native_config = 
YAML.load_file("misc/native.yml") +CROSS_RUBIES = native_config["platforms"].flat_map do |platform| + native_config["rubies"].map do |minor| + version = RakeCompilerDock.cross_rubies[minor] + CrossRuby.new(version, platform) end end - -ENV["RUBY_CC_VERSION"] = CROSS_RUBIES.map(&:ver).uniq.join(":") - -require "rake_compiler_dock" +RakeCompilerDock.set_ruby_cc_version(*native_config["rubies"]) def java? RUBY_PLATFORM.include?("java") @@ -375,7 +375,6 @@ if java? end else require "rake/extensiontask" - require "yaml" dependencies = YAML.load_file("dependencies.yml") diff --git a/rakelib/rdoc.rake b/rakelib/rdoc.rake index 289bb755de..61938a096e 100644 --- a/rakelib/rdoc.rake +++ b/rakelib/rdoc.rake @@ -1,26 +1,30 @@ # frozen_string_literal: true -require "rdoc/task" +begin + require "rdoc/task" -def rdoc_nokogiri_common_options(rdoc) - rdoc.rdoc_files - .include("README.md", "lib/**/*.rb", "ext/**/*.c") - .exclude("ext/nokogiri/test_global_handlers.c") - rdoc.options << "--embed-mixins" - rdoc.options << "--main=README.md" -end + def rdoc_nokogiri_common_options(rdoc) + rdoc.rdoc_files + .include("*.md", "lib/**/*.rb", "ext/**/*.c", "doc/**/*md") + .exclude("CHANGELOG.md", "ROADMAP.md", "ext/nokogiri/test_global_handlers.c") + rdoc.options << "--embed-mixins" + rdoc.options << "--main=README.md" + end -RDoc::Task.new(rdoc: "rdoc", clobber_rdoc: "rdoc:clean", rerdoc: "rdoc:force") do |rdoc| - rdoc.rdoc_dir = ENV["RDOC_DIR"] || "html" - rdoc.options << "--show-hash" - rdoc.options << "--template-stylesheets=misc/rdoc-tweaks.css" - rdoc_nokogiri_common_options(rdoc) -end + RDoc::Task.new(rdoc: "rdoc", clobber_rdoc: "rdoc:clean", rerdoc: "rdoc:force") do |rdoc| + rdoc.rdoc_dir = ENV["RDOC_DIR"] || "html" + rdoc.options << "--show-hash" + rdoc.options << "--template-stylesheets=misc/rdoc-tweaks.css" + rdoc_nokogiri_common_options(rdoc) + end -RDoc::Task.new(rdoc: "ri", clobber_rdoc: "ri:clean", rerdoc: "ri:force") do |rdoc| - rdoc.rdoc_dir = ENV["RI_DIR"] || "ri" - 
rdoc.generator = "ri" - rdoc_nokogiri_common_options(rdoc) -end + RDoc::Task.new(rdoc: "ri", clobber_rdoc: "ri:clean", rerdoc: "ri:force") do |rdoc| + rdoc.rdoc_dir = ENV["RI_DIR"] || "ri" + rdoc.generator = "ri" + rdoc_nokogiri_common_options(rdoc) + end -task clean: "rdoc:clean" # rubocop:disable Rake/Desc + task clean: "rdoc:clean" # rubocop:disable Rake/Desc +rescue LoadError => e + warn("WARNING: rdoc is not available in this environment: #{e}") +end diff --git a/scripts/test-gem-file-contents b/scripts/test-gem-file-contents index ba167fdbb1..45099dd143 100755 --- a/scripts/test-gem-file-contents +++ b/scripts/test-gem-file-contents @@ -16,8 +16,10 @@ require "bundler/inline" gemfile do source "https://rubygems.org" gem "minitest" + gem "rake-compiler-dock", ">= 1.9.1" end +require "rake_compiler_dock" require "yaml" def usage_and_exit(message = nil) @@ -68,22 +70,10 @@ require "minitest/autorun" puts "Testing '#{gemfile}' (#{gemspec.platform})" describe File.basename(gemfile) do - let(:cross_rubies_path) { File.join(File.dirname(__FILE__), "..", ".cross_rubies") } + let(:native_config) { YAML.load_file(File.join(__dir__, "..", "misc", "native.yml")) } - let(:platform_supported_ruby_versions) do - File.read(cross_rubies_path).split("\n").filter_map do |line| - ver, plat = line.split(":") - next if plat != gemspec.platform.to_s - - ver.split(".").take(2).join(".") # ugh - end.uniq.sort - end - - let(:all_supported_ruby_versions) do - File.read(cross_rubies_path).split("\n").map do |line| - ver, _ = line.split(":") - ver.split(".").take(2).join(".") # ugh - end.uniq.sort + let(:supported_ruby_versions) do + native_config["rubies"] end describe "setup" do @@ -93,7 +83,7 @@ describe File.basename(gemfile) do end it "gemspec is a Gem::Specification" do - assert_equal(Gem::Specification, gemspec.class) + assert_instance_of(Gem::Specification, gemspec) end end @@ -194,7 +184,7 @@ describe File.basename(gemfile) do end it "contains expected shared library files " do 
- platform_supported_ruby_versions.each do |version| + supported_ruby_versions.each do |version| actual = gemfile_contents.find do |p| File.fnmatch?("lib/nokogiri/#{version}/nokogiri.{so,bundle}", p, File::FNM_EXTGLOB) end @@ -210,15 +200,16 @@ describe File.basename(gemfile) do File.fnmatch?("lib/nokogiri/**/*.{so,bundle}", p, File::FNM_EXTGLOB) end assert_equal( - platform_supported_ruby_versions.length, + supported_ruby_versions.length, actual.length, "did not expect extra shared library files", ) end it "sets required_ruby_version appropriately" do - unsupported_versions = all_supported_ruby_versions - platform_supported_ruby_versions - platform_supported_ruby_versions.each do |v| + all_supported_ruby_versions = RakeCompilerDock.cross_rubies.keys + unsupported_versions = all_supported_ruby_versions - supported_ruby_versions + supported_ruby_versions.each do |v| assert( gemspec.required_ruby_version.satisfied_by?(Gem::Version.new(v)), "required_ruby_version='#{gemspec.required_ruby_version}' should support ruby #{v}", diff --git a/scripts/test-gem-install b/scripts/test-gem-install index 8528dc0475..6d3394d2f6 100755 --- a/scripts/test-gem-install +++ b/scripts/test-gem-install @@ -28,6 +28,7 @@ popd # 2.3.21 because https://github.com/rubygems/rubygems/issues/5914 # 2.3.22 because https://github.com/rubygems/rubygems/issues/5940 gem install bundler -v "~> 2.2, != 2.3.21, != 2.3.22" +bundle config set --local without rdoc bundle install --local || bundle install rm -rf lib ext # ensure we don't use the local files diff --git a/scripts/test-gem-installation b/scripts/test-gem-installation index 81f640a6a0..a3005e73f6 100755 --- a/scripts/test-gem-installation +++ b/scripts/test-gem-installation @@ -74,11 +74,11 @@ describe gemspec.full_name do describe "native platform" do it "declares packaged, precompiled libraries" do - assert(Nokogiri::VersionInfo.instance.libxml2_using_packaged?) + assert_predicate(Nokogiri::VersionInfo.instance, :libxml2_using_packaged?) 
assert(Nokogiri::VERSION_INFO["libxml"].key?("source")) assert_equal("packaged", Nokogiri::VERSION_INFO["libxml"]["source"]) - assert(Nokogiri::VersionInfo.instance.libxml2_precompiled?) + assert_predicate(Nokogiri::VersionInfo.instance, :libxml2_precompiled?) assert(Nokogiri::VERSION_INFO["libxml"].key?("precompiled")) assert(Nokogiri::VERSION_INFO["libxml"]["precompiled"]) end diff --git a/test/css/test_xpath_visitor.rb b/test/css/test_xpath_visitor.rb index 7d87f523bc..c852d5e840 100644 --- a/test/css/test_xpath_visitor.rb +++ b/test/css/test_xpath_visitor.rb @@ -143,53 +143,68 @@ def visit_pseudo_class_aaron(node) ) end - it "# id" do - assert_xpath("//*[@id='foo']", "#foo") - assert_xpath("//*[@id='escape:needed,']", "#escape\\:needed\\,") - assert_xpath("//*[@id='escape:needed,']", '#escape\3Aneeded\,') - assert_xpath("//*[@id='escape:needed,']", '#escape\3A needed\2C') - assert_xpath("//*[@id='escape:needed']", '#escape\00003Aneeded') - end - - describe "attribute" do - it "basic mechanics" do - assert_xpath("//h1[@a='Tender Lovemaking']", "h1[a='Tender Lovemaking']") - assert_xpath("//h1[@a]", "h1[a]") - assert_xpath(%q{//h1[@a='gnewline\n']}, "h1[a='\\gnew\\\nline\\\\n']") - assert_xpath("//h1[@a='test']", %q{h1[a=\te\st]}) - end - - it "parses leading @ (extended-syntax)" do - assert_xpath("//a[@id='Boing']", "a[@id='Boing']") - assert_xpath("//a[@id='Boing']", "a[@id = 'Boing']") - assert_xpath("//a[@id='Boing']//div", "a[@id='Boing'] div") + describe "namespaces" do + let(:ns) do + { + "xmlns" => "http://default.example.com/", + "hoge" => "http://hoge.example.com/", + } end - it "namespacing" do + it "basic mechanics" do assert_xpath("//a[@flavorjones:href]", "a[flavorjones|href]") assert_xpath("//a[@href]", "a[|href]") assert_xpath("//*[@flavorjones:href]", "*[flavorjones|href]") + end - ns = { - "xmlns" => "http://default.example.com/", - "hoge" => "http://hoge.example.com/", - } - - # An intentionally-empty namespace means "don't use the default 
xmlns" - assert_equal(["//a"], Nokogiri::CSS.xpath_for("|a", ns: ns, cache: false)) - - # The default namespace is not applied to attributes (just elements) + it "default namespace is applied to elements but not attributes" do assert_equal( ["//xmlns:a[@class='bar']"], Nokogiri::CSS.xpath_for("a[class='bar']", ns: ns, cache: false), ) + end - # We can explicitly apply a namespace to an attribue + it "default namespace is not applied to wildcard selectors" do + assert_equal( + ["//xmlns:a//*"], + Nokogiri::CSS.xpath_for("a *", ns: ns, cache: false), + ) + end + + it "intentionally-empty namespace omits the default xmlns" do + # An intentionally-empty namespace + assert_equal(["//a"], Nokogiri::CSS.xpath_for("|a", ns: ns, cache: false)) + end + + it "explicit namespaces are applied to attributes" do assert_equal( ["//xmlns:a[@hoge:class='bar']"], Nokogiri::CSS.xpath_for("a[hoge|class='bar']", ns: ns, cache: false), ) end + end + + describe "attribute" do + it "basic mechanics" do + assert_xpath("//h1[@a='Tender Lovemaking']", "h1[a='Tender Lovemaking']") + assert_xpath("//h1[@a]", "h1[a]") + assert_xpath(%q{//h1[@a='gnewline\n']}, "h1[a='\\gnew\\\nline\\\\n']") + assert_xpath("//h1[@a='test']", %q{h1[a=\te\st]}) + end + + it "#id escaping" do + assert_xpath("//*[@id='foo']", "#foo") + assert_xpath("//*[@id='escape:needed,']", "#escape\\:needed\\,") + assert_xpath("//*[@id='escape:needed,']", '#escape\3Aneeded\,') + assert_xpath("//*[@id='escape:needed,']", '#escape\3A needed\2C') + assert_xpath("//*[@id='escape:needed']", '#escape\00003Aneeded') + end + + it "parses leading @ (extended-syntax)" do + assert_xpath("//a[@id='Boing']", "a[@id='Boing']") + assert_xpath("//a[@id='Boing']", "a[@id = 'Boing']") + assert_xpath("//a[@id='Boing']//div", "a[@id='Boing'] div") + end it "rhs with quotes" do assert_xpath(%q{//h1[@a="'"]}, %q{h1[a="'"]}) diff --git a/test/helper.rb b/test/helper.rb index 24f3023d80..a74e0920c7 100644 --- a/test/helper.rb +++ b/test/helper.rb @@ 
-266,19 +266,19 @@ def util_decorate(document, decorator_module) document.decorate! end - def pending(msg) + def pending(msg, extra_uplevel = 0) begin yield rescue Minitest::Assertion - skip("pending #{msg} [#{caller(2..2).first}]") + skip("pending #{msg} [#{caller(2 + extra_uplevel, 1).first}]") end - flunk("pending test unexpectedly passed: #{msg} [#{caller(1..1).first}]") + flunk("pending test unexpectedly passed: #{msg} [#{caller(1 + extra_uplevel, 1).first}]") end def pending_if(msg, pend_eh, &block) return yield unless pend_eh - pending(msg, &block) + pending(msg, 1, &block) end # returns the page size in bytes diff --git a/test/html4/sax/test_push_parser.rb b/test/html4/sax/test_push_parser.rb index 6eb83c8339..cd37bad1d3 100644 --- a/test/html4/sax/test_push_parser.rb +++ b/test/html4/sax/test_push_parser.rb @@ -7,7 +7,7 @@ let(:parser) { Nokogiri::HTML4::SAX::PushParser.new(Nokogiri::SAX::TestCase::Doc.new) } it :test_end_document_called do - parser << (<<~HTML) + parser << <<~HTML

    Paragraph 1 @@ -19,7 +19,7 @@ end it :test_start_element do - parser << (<<~HTML) + parser << <<~HTML

    HTML @@ -29,7 +29,7 @@ parser.document.start_elements, ) - parser << (<<~HTML) + parser << <<~HTML Paragraph 1

    @@ -39,11 +39,11 @@ end it :test_chevron_partial_html do - parser << (<<~HTML) + parser << <<~HTML

    HTML - parser << (<<-HTML) + parser << <<~HTML Paragraph 1

    @@ -53,7 +53,7 @@ end it :test_chevron do - parser << (<<~HTML) + parser << <<~HTML

    Paragraph 1 diff --git a/test/html4/test_comments.rb b/test/html4/test_comments.rb index 5fc4d55d0e..7717e67c53 100644 --- a/test/html4/test_comments.rb +++ b/test/html4/test_comments.rb @@ -250,7 +250,7 @@ class TestComment < Nokogiri::TestCase { name: "div", children: [ { name: "comment", content: "[if foo]" }, - { name: "div", attributes: [{name: "id", value: "do-i-exist"}] }, + { name: "div", attributes: [{ name: "id", value: "do-i-exist" }] }, { name: "comment", content: "[endif]" }, ] } diff --git a/test/html4/test_document_fragment.rb b/test/html4/test_document_fragment.rb index 57ad1193b3..c9a8c8774e 100644 --- a/test/html4/test_document_fragment.rb +++ b/test/html4/test_document_fragment.rb @@ -193,9 +193,9 @@ def test_malformed_fragment_is_corrected assert_pattern do fragment => [ { name: "div", attributes: [ - { name: "<", value: ""}, - { name: "div", value: ""}, - ]} + { name: "<", value: "" }, + { name: "div", value: "" }, + ] } ] end else diff --git a/test/html4/test_node.rb b/test/html4/test_node.rb index 45ee8feb19..be00518e64 100644 --- a/test/html4/test_node.rb +++ b/test/html4/test_node.rb @@ -23,7 +23,15 @@ def test_to_a def test_attr node = @html.at("div.baz") - assert_equal(node["class"], node.attr("class")) + assert_equal("baz", node["class"]) + assert_equal("baz", node.attr("class")) + end + + def test_attribute + # https://github.com/sparklemotion/nokogiri/issues/3487 + node = @html.at("div.baz") + refute_nil(node.attribute("class")) + assert_equal("baz", node.attribute("class").value) end def test_get_attribute diff --git a/test/html5/test_attributes.rb b/test/html5/test_attributes.rb index 0dbfb6f13e..9cef22c7ed 100644 --- a/test/html5/test_attributes.rb +++ b/test/html5/test_attributes.rb @@ -15,4 +15,16 @@ def test_serialize_attribute assert_equal('id="foo"', id_attr.to_html) assert_equal('class="bar baz"', class_attr.to_html) end + + def test_duplicate_attributes + html = +"" + span = Nokogiri::HTML5::DocumentFragment.parse(html, 
max_attributes: 1000).at_css("span") + + assert_equal(676, span.attributes.length, "duplicate attribute should be silently ignored") + assert_equal("1", span["bb"], "bb attribute should hold the value of the first occurrence") + end end if Nokogiri.uses_gumbo? diff --git a/test/namespaces/test_namespace_definitions.rb b/test/namespaces/test_namespace_definitions.rb index 7df6ea8608..77bf887f1e 100644 --- a/test/namespaces/test_namespace_definitions.rb +++ b/test/namespaces/test_namespace_definitions.rb @@ -26,12 +26,10 @@ end child2 = doc.create_element("b", "xmlns:foo" => "http://nokogiri.org/ns/foo") - pending_if("https://github.com/sparklemotion/nokogiri/issues/2543", Nokogiri.jruby?) do - assert_equal(1, child2.namespace_definitions.length) - child2.namespace_definitions.first.tap do |ns| - assert_equal("foo", ns.prefix) - assert_equal("http://nokogiri.org/ns/foo", ns.href) - end + assert_equal(1, child2.namespace_definitions.length) + child2.namespace_definitions.first.tap do |ns| + assert_equal("foo", ns.prefix) + assert_equal("http://nokogiri.org/ns/foo", ns.href) end end end diff --git a/test/namespaces/test_serializing_namespaces.rb b/test/namespaces/test_serializing_namespaces.rb new file mode 100644 index 0000000000..1a1a4f5515 --- /dev/null +++ b/test/namespaces/test_serializing_namespaces.rb @@ -0,0 +1,251 @@ +# frozen_string_literal: true + +require "helper" + +# This test uses Nokogiri::XML::Builder for code terseness. +# The test is primarily intended to test serialization behavior, +# not tree construction. 
+describe "serializing namespaces" do + it "does not repeat xmlns definitions in child elements" do + # https://github.com/sparklemotion/nokogiri/issues/3455 + doc = Nokogiri::XML::Builder.new do |xml| + xml["ds"].Signature("xmlns:ds" => "http://www.w3.org/2000/09/xmldsig#") do + xml["ds"].SignatureValue("foobar") do + end + end + end.doc.to_xml(save_with: Nokogiri::XML::Node::SaveOptions::AS_XML) + + assert_includes(doc, '') + assert_includes(doc, "foobar") + end + + it "does not repeat xmlns definitions even when explicitly defined" do + doc = Nokogiri::XML::Builder.new do |xml| + xml["ds"].Signature("xmlns:ds" => "http://www.w3.org/2000/09/xmldsig#") do + xml["ds"].SignatureValue("foobar", "xmlns:ds" => "http://www.w3.org/2000/09/xmldsig#") do + end + end + end.doc.to_xml(save_with: Nokogiri::XML::Node::SaveOptions::AS_XML) + + assert_includes(doc, '') + assert_includes(doc, "foobar") + end + + it "redeclares xmlns definitions when shadowed" do + doc = Nokogiri::XML::Builder.new do |xml| + xml["dnd"].adventure("xmlns:dnd" => "http://www.w3.org/dungeons#") do + xml["dnd"].party("xmlns:dnd" => "http://www.w3.org/dragons#") do + xml["dnd"].members("xmlns:dnd" => "http://www.w3.org/dragons#") do + xml["dnd"].character("xmlns:dnd" => "http://www.w3.org/dungeons#") do + xml["dnd"].name("Nigel", "xmlns:dnd" => "http://www.w3.org/dungeons#") + end + end + end + end + end.doc.to_xml(save_with: Nokogiri::XML::Node::SaveOptions::AS_XML) + + assert_includes(doc, '') + assert_includes(doc, '') + assert_includes(doc, "") + assert_includes(doc, '') + assert_includes(doc, "Nigel") + end + + describe "default namespaces" do + it "properly handles default namespaces" do + doc = Nokogiri::XML::Builder.new do |xml| + xml.root(xmlns: "http://default-namespace.org/") do + xml.child("with default namespace") + xml["specific"].child("with specific namespace", "xmlns:specific" => "http://specific-namespace.org/") + end + end.doc.to_xml(save_with: 
Nokogiri::XML::Node::SaveOptions::AS_XML) + + assert_includes(doc, '') + assert_includes(doc, "with default namespace") + assert_includes(doc, + 'with specific namespace') + end + + it "handles nested default namespaces" do + doc = Nokogiri::XML::Builder.new do |xml| + xml.root(xmlns: "http://outer-namespace.org/") do + xml.outer("in outer namespace") + xml.inner(xmlns: "http://inner-namespace.org/") do + xml.element("in inner namespace") + end + xml.another("back in outer namespace") + end + end.doc.to_xml(save_with: Nokogiri::XML::Node::SaveOptions::AS_XML) + + assert_includes(doc, '') + assert_includes(doc, "in outer namespace") + assert_includes(doc, '') + pending_if("https://github.com/sparklemotion/nokogiri/issues/3457", Nokogiri.jruby?) do + # Here JRuby is incorrectly adding the xmlns namespace declaration, i.e.: + # 'in inner namespace' + assert_includes(doc, "in inner namespace") + end + assert_includes(doc, "back in outer namespace") + end + end + + describe "multiple namespaces on elements" do + it "handles multiple namespaces declared on a single element" do + doc = Nokogiri::XML::Builder.new do |xml| + xml.root do + xml.element( + "xmlns:ns1" => "http://namespace1.org/", + "xmlns:ns2" => "http://namespace2.org/", + "xmlns:ns3" => "http://namespace3.org/", + ) do + xml["ns1"].first("using first namespace") + xml["ns2"].second("using second namespace") + xml["ns3"].third("using third namespace") + end + end + end.doc.to_xml(save_with: Nokogiri::XML::Node::SaveOptions::AS_XML) + + assert_includes(doc, '') + assert_includes(doc, "using first namespace") + assert_includes(doc, "using second namespace") + assert_includes(doc, "using third namespace") + end + + it "handles multiple namespaces declared on middle elements" do + doc = Nokogiri::XML::Builder.new do |xml| + xml.root("xmlns:top" => "http://top-namespace.org/") do + xml["top"].level1 do + xml.middle( + "xmlns:mid1" => "http://middle1-namespace.org/", + "xmlns:mid2" => 
"http://middle2-namespace.org/", + ) do + xml["mid1"].item("using middle1 namespace") + xml["mid2"].item("using middle2 namespace") + xml["top"].item("still using top namespace") + + xml.bottom("xmlns:bot" => "http://bottom-namespace.org/") do + xml["bot"].item("using bottom namespace") + xml["mid1"].item("still using middle1 namespace") + end + end + end + end + end.doc.to_xml(save_with: Nokogiri::XML::Node::SaveOptions::AS_XML) + + assert_includes(doc, '') + assert_includes(doc, "") + assert_includes(doc, '') + assert_includes(doc, "using middle1 namespace") + assert_includes(doc, "using middle2 namespace") + assert_includes(doc, "still using top namespace") + assert_includes(doc, '') + assert_includes(doc, "using bottom namespace") + assert_includes(doc, "still using middle1 namespace") + end + end + + describe "namespace scope and visibility" do + it "handles namespace prefixes reused with different URIs" do + doc = Nokogiri::XML::Builder.new do |xml| + xml.root do + xml.outer("xmlns:ns" => "http://outer-uri.org/") do + xml["ns"].element("outer namespace") + + xml.inner("xmlns:ns" => "http://inner-uri.org/") do + xml["ns"].element("inner namespace") + end + + xml["ns"].another("back to outer namespace") + end + end + end.doc.to_xml(save_with: Nokogiri::XML::Node::SaveOptions::AS_XML) + + assert_includes(doc, '') + assert_includes(doc, "outer namespace") + assert_includes(doc, '') + assert_includes(doc, "inner namespace") + assert_includes(doc, "back to outer namespace") + end + + it "handles mixing default and prefixed namespaces" do + doc = Nokogiri::XML::Builder.new do |xml| + xml.root(:xmlns => "http://default.org/", "xmlns:ns" => "http://prefixed.org/") do + xml.default_element("in default namespace") + xml["ns"].prefixed_element("in prefixed namespace") + + xml.mixed(xmlns: "http://new-default.org/") do + xml.new_default("in new default namespace") + xml["ns"].still_prefixed("still using original prefixed namespace") + end + end + end.doc.to_xml(save_with: 
Nokogiri::XML::Node::SaveOptions::AS_XML) + + assert_includes(doc, '') + assert_includes(doc, "in default namespace") + assert_includes(doc, "in prefixed namespace") + assert_includes(doc, '') + pending_if("https://github.com/sparklemotion/nokogiri/issues/3457", Nokogiri.jruby?) do + # Here JRuby is incorrectly adding the xmlns namespace declaration, i.e.: + # 'in new default namespace' + assert_includes(doc, "in new default namespace") + end + assert_includes(doc, "still using original prefixed namespace") + end + end + + describe "namespace inheritance" do + it "inherits namespaces from ancestors without redeclaring them" do + doc = Nokogiri::XML::Builder.new do |xml| + xml.root("xmlns:a" => "http://a.org/", "xmlns:b" => "http://b.org/") do + xml["a"].first do + xml["a"].second do + xml["b"].inner("using b namespace inside a") + end + end + xml["b"].third do + xml["a"].inner("using a namespace inside b") + end + end + end.doc.to_xml(save_with: Nokogiri::XML::Node::SaveOptions::AS_XML) + + assert_includes(doc, '') + assert_includes(doc, "") + assert_includes(doc, "") + assert_includes(doc, "using b namespace inside a") + assert_includes(doc, "") + assert_includes(doc, "using a namespace inside b") + + # Ensure namespaces aren't redundantly declared + assert_equal(1, doc.scan('xmlns:a="http://a.org/"').count) + assert_equal(1, doc.scan('xmlns:b="http://b.org/"').count) + end + + it "works with namespace declarations at different levels of the hierarchy" do + doc = Nokogiri::XML::Builder.new do |xml| + xml.root("xmlns:top" => "http://top.org/") do + xml["top"].level1 do + xml["top"].level2("xmlns:mid" => "http://mid.org/") do + xml["mid"].item1 + xml["top"].item2 + + xml["mid"].container("xmlns:deep" => "http://deep.org/") do + xml["deep"].deepest + xml["mid"].stillMid + xml["top"].stillTop + end + end + end + end + end.doc.to_xml(save_with: Nokogiri::XML::Node::SaveOptions::AS_XML) + + assert_includes(doc, '') + assert_includes(doc, '') + assert_includes(doc, "") + 
assert_includes(doc, "") + assert_includes(doc, '') + assert_includes(doc, "") + assert_includes(doc, "") + assert_includes(doc, "") + end + end +end diff --git a/test/test_memory_usage.rb b/test/test_memory_usage.rb index a23bb675e0..1b2044b448 100644 --- a/test/test_memory_usage.rb +++ b/test/test_memory_usage.rb @@ -313,5 +313,29 @@ def start_element(name, attrs = []) # Expected error. This comment makes rubocop happy. end end + + it "XML::SAX::ParserContext.io holds a reference to IO input" do + content = File.read(XML_ATOM_FILE) + + memwatch(__method__) do + pc = Nokogiri::XML::SAX::ParserContext.io(StringIO.new(content), "ISO-8859-1") + parser = Nokogiri::XML::SAX::Parser.new(Nokogiri::SAX::TestCase::Doc.new) + GC.stress + pc.parse_with(parser) + + assert_equal(472, parser.document.data.length) + end + end + + it "XML::SAX::ParserContext.memory holds a reference to string input" do + memwatch(__method__) do + pc = Nokogiri::XML::SAX::ParserContext.memory(File.read(XML_ATOM_FILE), "ISO-8859-1") + parser = Nokogiri::XML::SAX::Parser.new(Nokogiri::SAX::TestCase::Doc.new) + GC.stress + pc.parse_with(parser) + + assert_equal(472, parser.document.data.length) + end + end end if ENV["NOKOGIRI_MEMORY_SUITE"] && Nokogiri.uses_libxml? 
end diff --git a/test/test_pattern_matching.rb b/test/test_pattern_matching.rb index fb2d6c2c15..2a1a6ffc1f 100644 --- a/test/test_pattern_matching.rb +++ b/test/test_pattern_matching.rb @@ -258,14 +258,14 @@ it "finds node contents" do assert_pattern do - doc => { root: { children: [*, { children: [*, {name: "grandchild1", content: }, *] }, *] } } + doc => { root: { children: [*, { children: [*, { name: "grandchild1", content: }, *] }, *] } } assert_equal("hello & goodbye", content) end end it "finds node contents by attribute" do assert_pattern do - doc => { root: { children: [*, { children: [*, {attributes: [*, {name: "size", value: "small"}, *], content: }, *] }, *] } } + doc => { root: { children: [*, { children: [*, { attributes: [*, { name: "size", value: "small" }, *], content: }, *] }, *] } } assert_equal("hello & goodbye", content) end end @@ -274,13 +274,13 @@ describe "Fragment" do it "finds nodes" do assert_pattern do - frag => [{name: "child1"}, {name: "child2"}, {name: "child3"}, {content: "\n"}] + frag => [{ name: "child1" }, { name: "child2" }, { name: "child3" }, { content: "\n" }] end end it "finds attributes" do assert_pattern do - frag => [*, {name: "child2", attributes: }, *] + frag => [*, { name: "child2", attributes: }, *] assert_equal("foo", attributes.first.name) end end @@ -289,7 +289,7 @@ describe "Node" do it "finds nodes" do assert_pattern do - doc.root => { elements: [{name: "child1"}, {name: "child2"}, {name: "child3"}] } + doc.root => { elements: [{ name: "child1" }, { name: "child2" }, { name: "child3" }] } end end end diff --git a/test/xml/node/test_attribute_methods.rb b/test/xml/node/test_attribute_methods.rb index 4d5b78cac7..fba50e7b58 100644 --- a/test/xml/node/test_attribute_methods.rb +++ b/test/xml/node/test_attribute_methods.rb @@ -193,134 +193,159 @@ def test_remove_class end describe "setup" do - it { _(node.get_attribute("noob")).must_be_nil } + it { assert_nil(node.get_attribute("noob")) } end describe 
"#kwattr_values" do it "returns an array of space-delimited values" do - _(node.kwattr_values("blargh")).must_equal(["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx"]) + assert_equal(["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx"], node.kwattr_values("blargh")) end describe "when no attribute exists" do it "returns an empty array" do - _(node.kwattr_values("noob")).must_equal([]) + assert_empty(node.kwattr_values("noob")) end end describe "when an empty attribute exists" do it "returns an empty array" do node.set_attribute("noob", "") - _(node.kwattr_values("noob")).must_equal([]) + assert_empty(node.kwattr_values("noob")) node.set_attribute("noob", " ") - _(node.kwattr_values("noob")).must_equal([]) + assert_empty(node.kwattr_values("noob")) end end end describe "kwattr_add" do it "returns the node for chaining" do - _(node.kwattr_add("noob", "asdf")).must_be_same_as(node) + assert_same(node, node.kwattr_add("noob", "asdf")) end it "creates a new attribute when necessary" do - _(node.kwattr_add("noob", "asdf").get_attribute("noob")).wont_be_nil + refute_nil(node.kwattr_add("noob", "asdf").get_attribute("noob")) end it "adds a new bare keyword string" do - _(node.kwattr_add("blargh", "jimmy").kwattr_values("blargh")) - .must_equal(["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx", "jimmy"]) + assert_equal( + ["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx", "jimmy"], + node.kwattr_add("blargh", "jimmy").kwattr_values("blargh"), + ) end it "does not add a repeated bare keyword string" do - _(node.kwattr_add("blargh", "foo").kwattr_values("blargh")) - .must_equal(["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx"]) + assert_equal( + ["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx"], + node.kwattr_add("blargh", "foo").kwattr_values("blargh"), + ) end describe "given a string of keywords" do it "adds new keywords and ignores existing keywords" do - _(node.kwattr_add("blargh", "foo 
jimmy\tjohnny").kwattr_values("blargh")) - .must_equal(["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx", "jimmy", "johnny"]) + assert_equal( + ["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx", "jimmy", "johnny"], + node.kwattr_add("blargh", "foo jimmy\tjohnny").kwattr_values("blargh"), + ) end end describe "given an array of keywords" do it "adds new keywords and ignores existing keywords" do - _(node.kwattr_add("blargh", ["foo", "jimmy"]).kwattr_values("blargh")) - .must_equal(["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx", "jimmy"]) + assert_equal( + ["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx", "jimmy"], + node.kwattr_add("blargh", ["foo", "jimmy"]).kwattr_values("blargh"), + ) end end end describe "kwattr_append" do it "returns the node for chaining" do - _(node.kwattr_append("noob", "asdf")).must_be_same_as(node) + assert_same(node, node.kwattr_append("noob", "asdf")) end it "creates a new attribute when necessary" do - _(node.kwattr_append("noob", "asdf").get_attribute("noob")).wont_be_nil + refute_nil(node.kwattr_append("noob", "asdf").get_attribute("noob")) end it "adds a new bare keyword string" do - _(node.kwattr_append("blargh", "jimmy").kwattr_values("blargh")) - .must_equal(["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx", "jimmy"]) + assert_equal( + ["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx", "jimmy"], + node.kwattr_append("blargh", "jimmy").kwattr_values("blargh"), + ) end it "adds a repeated bare keyword string" do - _(node.kwattr_append("blargh", "foo").kwattr_values("blargh")) - .must_equal(["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx", "foo"]) + assert_equal( + ["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx", "foo"], + node.kwattr_append("blargh", "foo").kwattr_values("blargh"), + ) end describe "given a string of keywords" do it "adds new keywords and existing keywords" do - _(node.kwattr_append("blargh", "foo jimmy\tjohnny").kwattr_values("blargh")) - 
.must_equal(["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx", "foo", "jimmy", "johnny"]) + assert_equal( + ["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx", "foo", "jimmy", "johnny"], + node.kwattr_append("blargh", "foo jimmy\tjohnny").kwattr_values("blargh"), + ) end end describe "given an array of keywords" do it "adds new keywords and existing keywords" do - _(node.kwattr_append("blargh", ["foo", "jimmy"]).kwattr_values("blargh")) - .must_equal(["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx", "foo", "jimmy"]) + assert_equal( + ["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx", "foo", "jimmy"], + node.kwattr_append("blargh", ["foo", "jimmy"]).kwattr_values("blargh"), + ) end end end describe "kwattr_remove" do it "returns the node for chaining" do - _(node.kwattr_remove("noob", "asdf")).must_be_same_as(node) + assert_same(node, node.kwattr_remove("noob", "asdf")) end it "gracefully handles a non-existent attribute" do - _(node.kwattr_remove("noob", "asdf").get_attribute("noob")).must_be_nil + assert_nil(node.kwattr_remove("noob", "asdf").get_attribute("noob")) end it "removes an existing bare keyword string" do - _(node.kwattr_remove("blargh", "foo").kwattr_values("blargh")) - .must_equal(["bar", "baz", "bar", "quux", "manx"]) + assert_equal( + ["bar", "baz", "bar", "quux", "manx"], + node.kwattr_remove("blargh", "foo").kwattr_values("blargh"), + ) end it "gracefully ignores a non-existent bare keyword string" do - _(node.kwattr_remove("blargh", "jimmy").kwattr_values("blargh")) - .must_equal(["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx"]) + assert_equal( + ["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx"], + node.kwattr_remove("blargh", "jimmy").kwattr_values("blargh"), + ) end describe "given a string of keywords" do it "removes existing keywords and ignores other keywords" do - _(node.kwattr_remove("blargh", "foo jimmy\tjohnny").kwattr_values("blargh")) - .must_equal(["bar", "baz", "bar", "quux", 
"manx"]) + assert_equal( + ["bar", "baz", "bar", "quux", "manx"], + node.kwattr_remove("blargh", "foo jimmy\tjohnny").kwattr_values("blargh"), + ) end end describe "given an array of keywords" do it "adds new keywords and existing keywords" do - _(node.kwattr_remove("blargh", ["foo", "jimmy"]).kwattr_values("blargh")) - .must_equal(["bar", "baz", "bar", "quux", "manx"]) + assert_equal( + ["bar", "baz", "bar", "quux", "manx"], + node.kwattr_remove("blargh", ["foo", "jimmy"]).kwattr_values("blargh"), + ) end end it "removes the attribute when no values are left" do - _(node.kwattr_remove("blargh", ["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx"]).get_attribute("blargh")).must_be_nil + node.kwattr_remove("blargh", ["foo", "bar", "baz", "bar", "foo", "quux", "foo", "manx"]) + assert_nil(node.get_attribute("blargh")) end end end diff --git a/test/xml/node/test_save_options.rb b/test/xml/node/test_save_options.rb index 69aeb949c7..b718205a68 100644 --- a/test/xml/node/test_save_options.rb +++ b/test/xml/node/test_save_options.rb @@ -19,9 +19,9 @@ def test_predicate_#{constant.downcase} def test_default_xml_save_options if Nokogiri.jruby? - assert_equal(0, (SaveOptions::DEFAULT_XML & SaveOptions::FORMAT)) + assert_equal(0, SaveOptions::DEFAULT_XML & SaveOptions::FORMAT) else - assert_equal(SaveOptions::FORMAT, (SaveOptions::DEFAULT_XML & SaveOptions::FORMAT)) + assert_equal(SaveOptions::FORMAT, SaveOptions::DEFAULT_XML & SaveOptions::FORMAT) end end end diff --git a/test/xml/sax/test_push_parser.rb b/test/xml/sax/test_push_parser.rb index a0dea3b99e..0ea93cfd76 100644 --- a/test/xml/sax/test_push_parser.rb +++ b/test/xml/sax/test_push_parser.rb @@ -88,7 +88,7 @@ def error(msg) end it :test_end_document_called do - parser << (<<~XML) + parser << <<~XML

    Paragraph 1 @@ -100,14 +100,14 @@ def error(msg) end it :test_start_element do - parser << (<<~XML) + parser << <<~XML

    XML assert_equal [["p", [["id", "asdfasdf"]]]], parser.document.start_elements - parser << (<<~XML) + parser << <<~XML Paragraph 1

    @@ -117,14 +117,14 @@ def error(msg) end it :test_start_element_with_namespaces do - parser << (<<~XML) + parser << <<~XML

    XML assert_equal [["p", [["xmlns:foo", "http://foo.example.com/"]]]], parser.document.start_elements - parser << (<<~XML) + parser << <<~XML Paragraph 1

    @@ -134,7 +134,7 @@ def error(msg) end it :test_start_element_ns do - parser << (<<~XML) + parser << <<~XML XML @@ -152,7 +152,7 @@ def error(msg) end it :test_end_element_ns do - parser << (<<~XML) + parser << <<~XML XML @@ -162,11 +162,11 @@ def error(msg) end it :test_chevron_partial_xml do - parser << (<<~XML) + parser << <<~XML

    XML - parser << (<<~XML) + parser << <<~XML Paragraph 1

    @@ -176,7 +176,7 @@ def error(msg) end it :test_chevron do - parser << (<<~XML) + parser << <<~XML

    Paragraph 1 @@ -192,7 +192,7 @@ def error(msg) it :test_recover do parser.options |= Nokogiri::XML::ParseOptions::RECOVER - parser << (<<~XML) + parser << <<~XML

    Foo @@ -248,7 +248,7 @@ def error(msg) it :test_untouched_entities do skip_unless_libxml2("entities are always replaced in pure Java version") - parser << (<<~XML) + parser << <<~XML

    Paragraph 1 & 2 @@ -261,7 +261,7 @@ def error(msg) it :test_replaced_entities do parser.replace_entities = true - parser << (<<~XML) + parser << <<~XML

    Paragraph 1 & 2 diff --git a/test/xml/test_builder.rb b/test/xml/test_builder.rb index 65dffe1ce3..f29bf44aba 100644 --- a/test/xml/test_builder.rb +++ b/test/xml/test_builder.rb @@ -78,6 +78,30 @@ def test_builder_namespace_part_deux assert_equal({ "xmlns:a" => "x", "xmlns:c" => "z" }, namespaces_defined_on(b)) end + def test_builder_namespaces_part_three + doc = Nokogiri::XML::Builder.new do |xml| + xml["dnd"].adventure("xmlns:dnd" => "http://www.w3.org/dungeons#") do + xml["dnd"].party("xmlns:dnd" => "http://www.w3.org/dragons#") do + xml["dnd"].character("xmlns:dnd" => "http://www.w3.org/dungeons#") do + xml["dnd"].name("Nigel", "xmlns:dnd" => "http://www.w3.org/dungeons#") + end + end + end + end.doc + + adventure_node = doc.at_xpath("//*[local-name()='adventure']") + assert_equal("http://www.w3.org/dungeons#", adventure_node.namespace.href) + + party_node = doc.at_xpath("//*[local-name()='party']") + assert_equal("http://www.w3.org/dragons#", party_node.namespace.href) + + character_node = doc.at_xpath("//*[local-name()='character']") + assert_equal("http://www.w3.org/dungeons#", character_node.namespace.href) + + name_node = doc.at_xpath("//*[local-name()='name']") + assert_equal("http://www.w3.org/dungeons#", name_node.namespace.href) + end + def test_builder_with_unlink b = Nokogiri::XML::Builder.new do |xml| xml.foo do diff --git a/test/xml/test_document_encoding.rb b/test/xml/test_document_encoding.rb index aecad95c5b..0127d1a0a0 100644 --- a/test/xml/test_document_encoding.rb +++ b/test/xml/test_document_encoding.rb @@ -124,9 +124,12 @@ class TestDocumentEncoding < Nokogiri::TestCase # no final newline on jruby. descriptive, not prescriptive. expected_length = Nokogiri.jruby? ? 
xml.bytesize - 1 : xml.bytesize - - assert_equal(Encoding::UTF_8, output.encoding) assert_equal(expected_length, output.bytesize) + + # Note: I dropped the assertion on the encoding of the string return from io.read + # because this behavior has changed back and forth in rubyzip versions 2.4.1 and + # 3.0.0.dev, and it's not relevant to the original bug report which was about an + # exception during writing. end end end diff --git a/test/xml/test_node.rb b/test/xml/test_node.rb index 641b0dce5b..138ee7decb 100644 --- a/test/xml/test_node.rb +++ b/test/xml/test_node.rb @@ -109,7 +109,7 @@ def test_node_context_parsing_of_malformed_html_fragment assert_empty(doc.errors) assert_pattern do nodeset => [ - { name: "div", attributes: [{name: "<", value: ""}, { name: "div", value: ""}] }, + { name: "div", attributes: [{ name: "<", value: "" }, { name: "div", value: "" }] }, ] end else @@ -131,7 +131,7 @@ def test_node_context_parsing_of_malformed_html_fragment_with_recover_is_correct assert_empty(doc.errors) assert_pattern do nodeset => [ - { name: "div", attributes: [{name: "<", value: ""}, { name: "div", value: ""}] }, + { name: "div", attributes: [{ name: "<", value: "" }, { name: "div", value: "" }] }, ] end else @@ -461,7 +461,7 @@ def test_description def test_spaceship nodes = xml.xpath("//employee") - assert_equal(-1, (nodes.first <=> nodes.last)) + assert_equal(-1, nodes.first <=> nodes.last) list = [nodes.first, nodes.last].sort assert_equal(nodes.first, list.first) assert_equal(nodes.last, list.last) diff --git a/test/xml/test_node_reparenting.rb b/test/xml/test_node_reparenting.rb index 71270513d3..9e833d2af1 100644 --- a/test/xml/test_node_reparenting.rb +++ b/test/xml/test_node_reparenting.rb @@ -88,13 +88,13 @@ class TestNodeReparenting < Nokogiri::TestCase it "unlinks the Node from its previous position" do @doc.at_xpath(params[:target]).send(method, @other_node) result = @other_doc.at_xpath("/root/a2") - _(result).must_be_nil + assert_nil(result) end it 
"inserts the Node in the proper position" do @doc.at_xpath(params[:target]).send(method, @other_node) result = @doc.at_xpath("/root/a1/a2") - _(result).wont_be_nil + refute_nil(result) end it "returns the expected value" do @@ -103,9 +103,9 @@ class TestNodeReparenting < Nokogiri::TestCase if !params.key?(:returns_self) assert(method.to_s.end_with?("=")) elsif params[:returns_self] - _(result).must_equal(sendee) + assert_equal(sendee, result) else - _(result).must_equal(@other_node) + assert_equal(@other_node, result) end end end @@ -115,7 +115,7 @@ class TestNodeReparenting < Nokogiri::TestCase it "inserts the fragment roots in the proper position" do @doc.at_xpath(params[:target]).send(method, @fragment_string) result = @doc.xpath("/root/a1/node()").collect(&:name) - _(result).must_equal(params[:children_tags]) + assert_equal(params[:children_tags], result) end it "returns the expected value" do @@ -124,10 +124,10 @@ class TestNodeReparenting < Nokogiri::TestCase if !params.key?(:returns_self) assert(method.to_s.end_with?("=")) elsif params[:returns_self] - _(result).must_equal(sendee) + assert_equal(sendee, result) else - _(result).must_be_kind_of(Nokogiri::XML::NodeSet) - _(result.to_html).must_equal(@fragment_string) + assert_kind_of(Nokogiri::XML::NodeSet, result) + assert_equal(@fragment_string, result.to_html) end end end @@ -135,7 +135,7 @@ class TestNodeReparenting < Nokogiri::TestCase it "inserts the fragment roots in the proper position" do @doc.at_xpath(params[:target]).send(method, @fragment) result = @doc.xpath("/root/a1/node()").collect(&:name) - _(result).must_equal(params[:children_tags]) + assert_equal(params[:children_tags], result) end end describe "passed a document" do @@ -152,7 +152,7 @@ class TestNodeReparenting < Nokogiri::TestCase it "inserts each member of the NodeSet in the proper order" do @doc.at_xpath(params[:target]).send(method, @node_set) result = @doc.xpath("/root/a1/node()").collect(&:name) - 
_(result).must_equal(params[:children_tags]) + assert_equal(params[:children_tags], result) end end end @@ -163,7 +163,7 @@ class TestNodeReparenting < Nokogiri::TestCase it "merges the Text node with adjacent Text nodes" do @doc.at_xpath("/root/a1").add_child(Nokogiri::XML::Text.new("hello", @doc)) result = @doc.at_xpath("/root/a1/text()").content - _(result).must_equal("First nodehello") + assert_equal("First nodehello", result) end end @@ -171,7 +171,7 @@ class TestNodeReparenting < Nokogiri::TestCase it "merges the Text node with adjacent Text nodes" do @doc.at_xpath("/root/a3/bx").replace(Nokogiri::XML::Text.new("hello", @doc)) result = @doc.at_xpath("/root/a3/text()").content - _(result).must_equal("Third hellonode") + assert_equal("Third hellonode", result) end end end diff --git a/test/xml/test_node_set.rb b/test/xml/test_node_set.rb index 2e2e6c7cde..5655c9e953 100644 --- a/test/xml/test_node_set.rb +++ b/test/xml/test_node_set.rb @@ -129,6 +129,23 @@ class TestNodeSet < Nokogiri::TestCase end end + it "#attr on XML gets attribute from first node" do + doc = Nokogiri::XML("") + children = doc.css("child") + + refute_nil(children.attr("name")) + assert_equal(children.first.attribute("name"), children.attr("name")) + end + + it "#attr on HTML gets attribute from first node" do + # https://github.com/sparklemotion/nokogiri/issues/3487 + doc = Nokogiri::HTML("") + children = doc.css("child") + + refute_nil(children.attr("name")) + assert_equal(children.first.attribute("name"), children.attr("name")) + end + it "#attribute with no args gets attribute from first node" do list.first["foo"] = "bar" assert_equal(list.first.attribute("foo"), list.attribute("foo")) diff --git a/test/xml/test_xpath.rb b/test/xml/test_xpath.rb index 7948cac051..f80dba124b 100644 --- a/test/xml/test_xpath.rb +++ b/test/xml/test_xpath.rb @@ -5,13 +5,6 @@ module Nokogiri module XML class TestXPath < Nokogiri::TestCase - # - # Note that many of these tests vary for jruby because custom 
xpath functions in JRuby require - # a namespace, and libxml2 (and the original implementation of Nokogiri) do not. - # - # Ideally we should change this to always require a namespace. - # See https://github.com/sparklemotion/nokogiri/issues/2147 - # def setup super