Add proxy use per site by uBaze · Pull Request #112 · cvandeplas/pystemon · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Add proxy use per site #112

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions pystemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def load_config(config):

for i in range(config.threads):
name = "[ThreadPasties][{}][{}]".format(site.name, i+1)
user_agent = PystemonUA(name, config.proxies_list,
user_agent = PystemonUA(name, config.proxies_list, proxify_all=config.proxify_all,
user_agents_list = config.user_agents_list,
throttler=throttler, ip_addr=config.ip_addr)
t = ThreadPasties(user_agent, queue_name=site.name, queue=site.queue)
Expand All @@ -112,7 +112,7 @@ def load_config(config):

# Compressed is used to guess the filename, so it's mandatory to pass it along
name = "[PastieSite][{}]".format(site.name)
site_ua=PystemonUA(name, config.proxies_list,
site_ua=PystemonUA(name, config.proxies_list, proxify_all=config.proxify_all,
user_agents_list = config.user_agents_list,
throttler = throttler, ip_addr = config.ip_addr)
t = PastieSite(site.name, site.download_url, site.archive_url, site.archive_regex,
Expand All @@ -121,6 +121,7 @@ def load_config(config):
site_update_min = site.update_min,
site_update_max = site.update_max,
site_pastie_classname = site.pastie_classname,
site_use_proxy = site.use_proxy,
site_save_dir = config.save_dir,
site_archive_dir = config.archive_dir,
archive_compress = config.compress,
Expand Down
24 changes: 19 additions & 5 deletions pystemon/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def __init__(self, name, config):
self.metadata_url = config.get('metadata-url')
self.update_min = config.get('update-min', 10)
self.update_max = config.get('update-max', 30)
self.use_proxy = config.get('use-proxy', False)
self.pastie_classname = config.get('pastie-classname')

@property
Expand Down Expand Up @@ -159,6 +160,11 @@ def storage_engines(self):
with self.lock:
return self._storage_engines

@property
def proxify_all(self):
with self.lock:
return self._proxify_all

@property
def proxies_list(self):
with self.lock:
Expand Down Expand Up @@ -208,6 +214,7 @@ def reload(self):
self._save_dir = config.get('save_dir')
self._archive_dir = config.get('archive_dir')
self._compress = config.get('compress')
self._proxify_all = config.get('proxify_all')
self._proxies_list = config.get('proxies_list')
self._re_module = config.get('re_module')
self._patterns = config.get('patterns')
Expand Down Expand Up @@ -238,11 +245,11 @@ def _reload(self):
logger.debug("parsing yaml configuration from file '{}'".format(self._configfile))
config = {}
yamlconfig = self._yamlconfig
try:
if yamlconfig['proxy']['random']:
config['proxies_list'] = ProxyList(yamlconfig['proxy']['file'])
except KeyError:
pass

proxyconfig = yamlconfig.get('proxy', {})
config['proxify_all'] = proxyconfig.get('random', False)
if config['proxify_all']:
config['proxies_list'] = ProxyList(yamlconfig['proxy']['file'])

config['save_thread'] = yamlconfig.get('save-thread', False)

Expand Down Expand Up @@ -278,6 +285,13 @@ def _reload(self):

config['sites'] = self._load_sites(yamlconfig)

if not "proxies_list" in config:
for site in config['sites']:
if site.use_proxy:
logger.debug("site {0} found with use-proxy directive, getting proxies list".format(site.name))
config['proxies_list'] = ProxyList(yamlconfig['proxy']['file'])
break

if not self.debug and 'logging-level' in yamlconfig:
if yamlconfig['logging-level'] in ['NOTSET', 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']:
logger.setLevel(logging.getLevelName(yamlconfig['logging-level']))
Expand Down
5 changes: 3 additions & 2 deletions pystemon/pastie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def __init__(self, site, pastie_id):
self.md5 = None
self.url = self.site.download_url.format(id=self.id)
self.public_url = self.site.public_url.format(id=self.id)
self.use_proxy = self.site.use_proxy
self.metadata_url = None
if self.site.metadata_url is not None:
self.metadata_url = self.site.metadata_url.format(id=self.id)
Expand All @@ -92,11 +93,11 @@ def download_url(self, url, **kwargs):

def fetch_pastie(self):
if self.metadata_url is not None:
response = self.download_url(self.metadata_url)
response = self.download_url(self.metadata_url, use_proxy=self.use_proxy)
if response is not None:
response = response.content
self.pastie_metadata = response
response = self.download_url(self.url)
response = self.download_url(self.url, use_proxy=self.use_proxy)
if response is not None:
response = response.content
self.pastie_content = response
Expand Down
3 changes: 2 additions & 1 deletion pystemon/pastiesite.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def __init__(self, name, download_url, archive_url, archive_regex, **kwargs):
if kwargs['site_metadata_url'] is not None:
self.metadata_url = kwargs['site_metadata_url']

self.use_proxy = kwargs.get('site_use_proxy', False)
self.archive_compress = kwargs.get('archive_compress', False)
self.update_min = kwargs['site_update_min']
self.update_max = kwargs['site_update_max']
Expand Down Expand Up @@ -167,7 +168,7 @@ def get_last_pasties(self):
# reset the pasties list
pasties = []
# populate queue with data
response = self.user_agent.download_url(self.archive_url)
response = self.user_agent.download_url(self.archive_url, use_proxy=self.use_proxy)
if not response:
logger.warning("Failed to download page {url}".format(url=self.archive_url))
return False
Expand Down
11 changes: 6 additions & 5 deletions pystemon/ua.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,13 @@ def get_bound_session(self):
self.name, self.ip_addr, str(e)))
return session

def __init__(self, name, proxies_list, user_agents_list = [],
def __init__(self, name, proxies_list, proxify_all=False, user_agents_list = [],
retries_client=5, retries_server=100,
throttler=None, ip_addr=None,
connection_timeout=3.05, read_timeout=10):
self.name = "user-agent"+name
self.user_agents_list = user_agents_list
self.proxify_all = proxify_all
self.proxies_list = proxies_list
self.retries_client = retries_client
self.retries_server = retries_server
Expand All @@ -86,7 +87,7 @@ def get_random_user_agent(self):
def __parse_http__(self, url, session, random_proxy):
logger.debug("{}: Parsing response for url '{}'".format(self.name, url))
try:
response = session.get(url, stream=True, timeout=(self.connection_timeout, self.read_timeout))
response = session.get(url, stream=True, verify=False, timeout=(self.connection_timeout, self.read_timeout))
response.raise_for_status()
res = {'response': response}
except HTTPError as e:
Expand Down Expand Up @@ -174,7 +175,7 @@ def __download_url__(self, url, session, random_proxy):
# do NOT try to download the url again here, as we might end in enless loop
return res

def download_url(self, url, data=None, cookie=None, wait=0):
def download_url(self, url, use_proxy=False, data=None, cookie=None, wait=0):
# let's not recurse where exceptions can raise exceptions can raise exceptions can...
response = None
loop_client = 0
Expand All @@ -189,10 +190,10 @@ def download_url(self, url, data=None, cookie=None, wait=0):
logger.debug("{}: download_url: permission to download granted".format(self.name))
session = self.get_bound_session()
random_proxy = None
if self.proxies_list:
if self.proxies_list and (self.proxify_all or use_proxy):
random_proxy = self.proxies_list.get_random_proxy()
if random_proxy:
session.proxies = {'http': random_proxy}
session.proxies = {'http': random_proxy, 'https': random_proxy}
user_agent = self.get_random_user_agent()
session.headers.update({'User-Agent': user_agent, 'Accept-Charset': 'utf-8'})
if cookie:
Expand Down
0