8000 Eliminate most provider match checks to improve speed by dullbananas · Pull Request #9 · jendrikw/clearurls · GitHub
Eliminate most provider match checks to improve speed #9

Open · dullbananas wants to merge 3 commits into main
58 changes: 40 additions & 18 deletions src/lib.rs
@@ -17,6 +17,7 @@
#![warn(unused)]
#![warn(unused_extern_crates)]
#![warn(unused_import_braces)]
#![allow(rust_2024_compatibility)]
// Clippy categories
#![warn(clippy::cargo)]
#![warn(clippy::complexity)]
@@ -79,12 +80,15 @@ extern crate alloc;
extern crate std;

use alloc::borrow::Cow;
use alloc::collections::BTreeMap;
use alloc::string::String;
use alloc::vec::Vec;
use core::fmt::{Display, Formatter};
use core::str::{FromStr, Utf8Error};
use regex::Regex;
use url::{ParseError, Url};

use rules::Rules;
use rules::{Provider, Rules, keys_from_url};

mod deserialize_utils;
mod rules;
@@ -98,7 +102,8 @@ mod tests;
/// It's recommended to create one per application and reuse it.
#[derive(Debug)]
pub struct UrlCleaner {
rules: Rules,
providers: BTreeMap<String, Vec<Provider>>,
keyless_providers: Vec<Provider>,
strip_referral_marketing: bool,
}

@@ -117,19 +122,13 @@ impl UrlCleaner {
#[cfg(feature = "std")]
pub fn from_rules_file<R: std::io::Read>(reader: R) -> Result<Self, Error> {
let buf = std::io::BufReader::new(reader);
Ok(Self {
rules: serde_json::from_reader(buf)?,
strip_referral_marketing: false,
})
Ok(Self::from_rules(serde_json::from_reader(buf)?))
}

/// # Errors
/// See [`Error`]
pub fn from_rules_str(rules: &str) -> Result<Self, Error> {
Ok(Self {
rules: serde_json::from_str(rules)?,
strip_referral_marketing: false,
})
Ok(Self::from_rules(serde_json::from_str(rules)?))
}

/// Construct using the JSON embedded in this library.
@@ -171,7 +170,7 @@ impl UrlCleaner {
return Ok(Cow::Borrowed(url));
}
let mut result = Url::from_str(url)?;
for p in &self.rules.providers {
for p in self.possible_matches(url) {
if p.match_url(result.as_str()) {
result = p.remove_fields_from_url(&result, self.strip_referral_marketing)?;
}
@@ -196,14 +195,14 @@ impl UrlCleaner {
if url.scheme().starts_with("data") {
return Ok(Cow::Borrowed(url));
}
let mut url = Cow::Borrowed(url);
for p in &self.rules.providers {
if p.match_url(url.as_str()) {
url = Cow::Owned(p.remove_fields_from_url(&url, self.strip_referral_marketing)?);
let mut result = Cow::Borrowed(url);
for p in self.possible_matches(url.as_str()) {
if p.match_url(result.as_str()) {
result = Cow::Owned(p.remove_fields_from_url(&result, self.strip_referral_marketing)?);
}
}

Ok(url)
Ok(result)
}

/// Clean all URLs in a text.
@@ -218,7 +217,7 @@ impl UrlCleaner {
/// Text outside of URLs is left unchanged.
///
/// # Errors
/// All errors encountered are returned in a [`Vec`][alloc::vec::Vec].
/// All errors encountered are returned in a [`Vec`].
#[cfg(feature = "linkify")]
pub fn clear_text<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, alloc::vec::Vec<Error>> {
self.clear_text_with_linkfinder(s, &linkify::LinkFinder::new())
@@ -237,7 +236,7 @@ impl UrlCleaner {
/// Text outside of URLs is left unchanged.
///
/// # Errors
/// All errors encountered are returned in a [`Vec`][alloc::vec::Vec].
/// All errors encountered are returned in a [`Vec`].
#[cfg(feature = "linkify")]
pub fn clear_text_with_linkfinder<'a>(
&self,
@@ -338,6 +337,29 @@ impl UrlCleaner {
Err(result)
}
}

fn possible_matches<'a>(&'a self, url: &'a str) -> impl Iterator<Item = &'a Provider> + 'a {
keys_from_url(url)
.flat_map(|key| self.providers.get(key).map(Vec::as_slice).unwrap_or_default())
.chain(&self.keyless_providers)
}

fn from_rules(rules: Rules) -> Self {
let mut providers = BTreeMap::new();
let mut keyless_providers = Vec::new();
for provider in rules.providers {
if let Some(key) = provider.get_key() {
providers.entry(key).or_insert_with(Vec::new).push(provider);
} else {
keyless_providers.push(provider);
}
}
Self {
providers,
keyless_providers,
strip_referral_marketing: false,
}
}
}

/// Various errors that can happen while cleaning a URL
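The effect of this dispatch is easiest to see with toy data. Below is a minimal standalone sketch, not part of the diff: the provider names, URL labels, and `main` wrapper are invented for illustration, showing how `possible_matches` narrows the candidate set before any regex is evaluated:

use std::collections::BTreeMap;

fn main() {
    // Providers indexed by their key (one domain label), as in `from_rules`.
    let mut providers: BTreeMap<&str, Vec<&str>> = BTreeMap::new();
    providers.insert("amazon", vec!["amazon provider"]);
    providers.insert("google", vec!["google provider"]);
    // Providers whose pattern yields no key must be checked for every URL.
    let keyless: Vec<&str> = vec!["catch-all provider"];

    // `keys_from_url` would yield the host labels before the final dot,
    // e.g. ["www", "amazon"] for "https://www.amazon.com/gp/product".
    let url_keys = ["www", "amazon"];

    // Candidate set = the buckets for each label, plus the keyless fallback.
    let candidates: Vec<_> = url_keys
        .iter()
        .flat_map(|k| providers.get(k).map(Vec::as_slice).unwrap_or_default())
        .chain(&keyless)
        .collect();

    // Only 2 of the 3 providers would be regex-matched for this URL.
    assert_eq!(candidates, [&"amazon provider", &"catch-all provider"]);
}

The `BTreeMap` keeps each lookup at O(log n) per label, so the per-URL cost scales with a handful of host labels rather than with the full provider list; only the keyless providers still pay a regex check on every URL, which is why the new test below bounds how many of them there are.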
34 changes: 34 additions & 0 deletions src/rules.rs
@@ -1,4 +1,5 @@
use alloc::borrow::Cow;
use alloc::borrow::ToOwned;
use alloc::str::FromStr;
use alloc::string::String;
use alloc::vec::Vec;
@@ -76,6 +77,18 @@ impl Provider {
self.url_pattern.is_match(url) && !self.match_exception(url)
}

/// If this returns `Some(key)`, then `provider.match_url(url)` can only return `true` when `keys_from_url(url)` yields `key`. This is used to improve performance by eliminating most calls to `match_url`.
pub(crate) fn get_key(&self) -> Option<String> {
self
.url_pattern
.as_str()
.replace(r"\/", "/")
.replace(r"\-", "-")
.strip_prefix(r"^https?://(?:[a-z0-9-]+\.)*?")
.and_then(|s| key_iter(s, r"\.").next())
.map(ToOwned::to_owned)
}

fn match_exception(&self, url: &str) -> bool {
url == "javascript:void(0)" || self.exceptions.is_match(url)
}
Expand Down Expand Up @@ -103,6 +116,27 @@ impl Provider {
}
}

/// See `Provider::get_key`
pub(crate) fn keys_from_url(url: &str) -> impl Iterator<Item = &str> {
url
.strip_prefix("http")
.map(|s| s.strip_prefix('s').unwrap_or(s))
.and_then(|s| s.strip_prefix("://"))
.into_iter()
.flat_map(|s| key_iter(s, "."))
}

fn key_iter<'a>(s: &'a str, delimiter: &'static str) -> impl Iterator<Item = &'a str> + 'a {
s
.split_inclusive(delimiter)
.filter_map(move |s| s.strip_suffix(delimiter))
.take_while(|&s| !s.is_empty() && s.chars().all(is_allowed_domain_char))
}

const fn is_allowed_domain_char(c: char) -> bool {
c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-'
}

fn serialize_params<'a>(
mut params: impl Iterator<Item = &'a (Cow<'a, str>, Cow<'a, str>)>,
) -> Option<String> {
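The two key functions are mirror images: `keys_from_url` splits a URL's host into dot-delimited labels, and `get_key` extracts the first such label from a provider's `urlPattern` after stripping the common subdomain prefix. A small sketch tracing both sides (the `key_iter` copy and the example inputs are reproduced here for illustration, not added by the PR):

// Same logic as `key_iter` above, inlined so the trace is runnable on its own.
fn key_iter<'a>(s: &'a str, delimiter: &'static str) -> impl Iterator<Item = &'a str> + 'a {
    s.split_inclusive(delimiter)
        .filter_map(move |part| part.strip_suffix(delimiter))
        .take_while(|&part| {
            !part.is_empty()
                && part
                    .chars()
                    .all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-')
        })
}

fn main() {
    // URL side: after "https://" is stripped, every label that ends in "."
    // becomes a key; "com/gp/product" has no trailing dot and is dropped.
    let keys: Vec<_> = key_iter("www.amazon.com/gp/product", ".").collect();
    assert_eq!(keys, ["www", "amazon"]);

    // Provider side: from a pattern like `^https?:\/\/(?:[a-z0-9-]+\.)*?amazon\.`,
    // `get_key` strips the escaped prefix and reads the first `\.`-delimited
    // label, so this provider lands in the "amazon" bucket.
    assert_eq!(key_iter(r"amazon\.", r"\.").next(), Some("amazon"));
}

If either side changed its delimiter handling without the other, keyed providers would silently stop matching; the `test_key_optimization_effectiveness` test below guards the related risk of keys that are shared by too many providers.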
70 changes: 51 additions & 19 deletions src/tests/mod.rs
@@ -140,9 +140,9 @@ fn test_raw_rules_produce_invalid_url() {
fn test_from_read_vec() {
let data = br#"{"providers":{"example":{"urlPattern":"","rules":["foo"]}}}"#;
let c = UrlCleaner::from_rules_file(&data[..]).unwrap();
assert_eq!(c.rules.providers.len(), 1);
assert_eq!(c.rules.providers[0].rules.len(), 1);
assert_eq!(c.rules.providers[0].rules[0].as_str(), "foo");
assert_eq!(c.keyless_providers.len(), 1);
assert_eq!(c.keyless_providers[0].rules.len(), 1);
assert_eq!(c.keyless_providers[0].rules[0].as_str(), "foo");
}

#[test]
@@ -174,9 +174,9 @@ fn test_from_read_file() {
.unwrap();
file.seek(SeekFrom::Start(0)).unwrap();
let c = UrlCleaner::from_rules_file(&file).unwrap();
assert_eq!(c.rules.providers.len(), 1);
assert_eq!(c.rules.providers[0].rules.len(), 1);
assert_eq!(c.rules.providers[0].rules[0].as_str(), "foo");
assert_eq!(c.keyless_providers.len(), 1);
assert_eq!(c.keyless_providers[0].rules.len(), 1);
assert_eq!(c.keyless_providers[0].rules[0].as_str(), "foo");
}

#[test]
@@ -187,9 +187,9 @@ fn test_from_path() {
.unwrap();
file.seek(SeekFrom::Start(0)).unwrap();
let c = UrlCleaner::from_rules_path(file.path()).unwrap();
assert_eq!(c.rules.providers.len(), 1);
assert_eq!(c.rules.providers[0].rules.len(), 1);
assert_eq!(c.rules.providers[0].rules[0].as_str(), "foo");
assert_eq!(c.keyless_providers.len(), 1);
assert_eq!(c.keyless_providers[0].rules.len(), 1);
assert_eq!(c.keyless_providers[0].rules[0].as_str(), "foo");
}

#[test]
@@ -207,16 +207,15 @@ fn test_from_invalid_path() {
#[test]
fn test_remove_fields_from_url_errors() {
let provider = UrlCleaner {
rules: Rules {
providers: vec![Provider {
url_pattern: Regex::new(".*").unwrap(),
rules: vec![],
raw_rules: vec![],
referral_marketing: vec![],
exceptions: RegexSet::default(),
redirections: vec![],
}],
},
providers: BTreeMap::new(),
keyless_providers: vec![Provider {
url_pattern: Regex::new(".*").unwrap(),
rules: vec![],
raw_rules: vec![],
referral_marketing: vec![],
exceptions: RegexSet::default(),
redirections: vec![],
}],
strip_referral_marketing: false,
};
let err = provider.clear_single_url_str("//example.com").unwrap_err();
@@ -231,6 +230,39 @@ fn test_remove_fields_from_url_errors() {
);
}

#[test]
fn test_key_optimization_effectiveness() {
// If an assertion here fails, then maybe `Provider::get_key` doesn't return a unique value often enough for a new dataset
let cleaner = UrlCleaner::from_embedded_rules().unwrap();
let num_with_keys = cleaner.providers.values().map(Vec::len).sum::<usize>();
let num_without_keys = cleaner.keyless_providers.len();
let total = num_with_keys + num_without_keys;
let (most_common_key, highest_count) = cleaner
.providers
.iter()
.map(|(k, v)| (k, v.len()))
.max_by_key(|&(_, v)| v)
.unwrap();
let num_with_dup_keys = cleaner
.providers
.values()
.filter(|v| v.len() > 1)
.map(Vec::len)
.sum::<usize>();
assert!(
num_with_keys > num_without_keys * 4,
"very high number of providers have no keys: {num_without_keys}/{total}"
);
assert!(
highest_count < 5,
"key {most_common_key:?} is used by very high number of providers: {highest_count}"
);
assert!(
num_with_keys > num_with_dup_keys * 20,
"very high number of keys are duplicates: {num_with_dup_keys}/{num_with_keys}"
);
}

#[cfg(feature = "std")]
fn error_eq<T: std::error::Error + PartialEq + 'static>(
x: &T,
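None of this changes the public API, so existing callers are untouched. A usage sketch for orientation (a hedged example, assuming the crate's `std` feature and its embedded ruleset; the exact output depends on the bundled rules, and the tracking parameter shown is only an example):

fn main() {
    // Dispatch happens inside `clear_single_url_str`; callers see no change.
    let cleaner = clearurls::UrlCleaner::from_embedded_rules().unwrap();
    let cleaned = cleaner
        .clear_single_url_str("https://example.com/page?utm_source=newsletter")
        .unwrap();
    // With the key optimization, only providers keyed on a label of
    // "example.com" (plus keyless ones) had their regexes evaluated.
    println!("{cleaned}");
}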