From 0a8e55454d24f0006eb96d387f8798884f14b227 Mon Sep 17 00:00:00 2001
From: Stephen Johnson
Date: Sat, 3 Aug 2013 12:48:51 -0700
Subject: [PATCH 1/4] Normalize urlparse behavior for s3, s3n, hdfs, mapreduce
 url schemes for all Python versions

---
 mrjob/parse.py      | 16 +++++++++++++++-
 tests/test_parse.py |  2 +-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/mrjob/parse.py b/mrjob/parse.py
index ebb156ed0..13e2b4d4b 100644
--- a/mrjob/parse.py
+++ b/mrjob/parse.py
@@ -15,8 +15,10 @@
 from functools import wraps
 import logging
 import re
+import urlparse
 from urlparse import ParseResult
-from urlparse import urlparse as urlparse_buggy
+
+urlparse_buggy = urlparse.urlparse
 
 try:
     from cStringIO import StringIO
@@ -27,6 +29,18 @@
 
 from mrjob.compat import uses_020_counters
 
 
+### danger: monkey patch urlparse on python < 2.7.4 ###
+
+if hasattr(urlparse, 'uses_fragment'):
+    # Python 2.7.4 changes the behavior of urlparse to parse the fragment out
+    # of any URL scheme, which breaks our tests and possibly our code. The
+    # below patch will normalize the behavior for the listed URL schemes.
+    #
+    # http://bugs.python.org/issue9374
+    # http://hg.python.org/cpython/rev/79e6ff3d9afd
+    urlparse.uses_fragment.extend(['s3', 's3n', 'hdfs', 'mapreduce'])
+
+
 # match the filename of a hadoop streaming jar
 HADOOP_STREAMING_JAR_RE = re.compile(r'^hadoop.*streaming.*\.jar$')
 
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 5a98a4160..ac7a985c5 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -540,7 +540,7 @@ def test_urlparse(self):
         self.assertEqual(urlparse('s3://bucket/path'),
                          ('s3', 'bucket', '/path', '', '', ''))
         self.assertEqual(urlparse('s3://bucket/path#customname'),
-                         ('s3', 'bucket', '/path#customname', '', '', ''))
+                         ('s3', 'bucket', '/path', '', '', 'customname'))
         self.assertEqual(urlparse('s3://bucket'),
                          ('s3', 'bucket', '', '', '', ''))
         self.assertEqual(urlparse('s3://bucket/'),

From 90dce482f11176d6bebc239a623449240df468f7 Mon Sep 17 00:00:00 2001
From: Stephen Johnson
Date: Sun, 4 Aug 2013 17:26:20 -0700
Subject: [PATCH 2/4] Better solution to py2.7.4 urlparse behavior fix

---
 mrjob/parse.py | 48 ++++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/mrjob/parse.py b/mrjob/parse.py
index 13e2b4d4b..46743b5bd 100644
--- a/mrjob/parse.py
+++ b/mrjob/parse.py
@@ -15,10 +15,8 @@
 from functools import wraps
 import logging
 import re
-import urlparse
 from urlparse import ParseResult
-
-urlparse_buggy = urlparse.urlparse
+from urlparse import urlparse as urlparse_buggy
 
 try:
     from cStringIO import StringIO
@@ -29,18 +27,6 @@
 
 from mrjob.compat import uses_020_counters
 
 
-### danger: monkey patch urlparse on python < 2.7.4 ###
-
-if hasattr(urlparse, 'uses_fragment'):
-    # Python 2.7.4 changes the behavior of urlparse to parse the fragment out
-    # of any URL scheme, which breaks our tests and possibly our code. The
-    # below patch will normalize the behavior for the listed URL schemes.
-    #
-    # http://bugs.python.org/issue9374
-    # http://hg.python.org/cpython/rev/79e6ff3d9afd
-    urlparse.uses_fragment.extend(['s3', 's3n', 'hdfs', 'mapreduce'])
-
-
 # match the filename of a hadoop streaming jar
 HADOOP_STREAMING_JAR_RE = re.compile(r'^hadoop.*streaming.*\.jar$')
 
@@ -94,17 +80,27 @@ def parse_s3_uri(uri):
 
 @wraps(urlparse_buggy)
 def urlparse(*args, **kwargs):
-    """A wrapper for :py:func:`urlparse.urlparse` that handles buckets in S3
-    URIs correctly. (:py:func:`~urlparse.urlparse` does this correctly sometime
-    after 2.6.1; this is just a patch for older Python versions.)"""
-    components = urlparse_buggy(*args, **kwargs)
-    if components.netloc == '' and components.path.startswith('//'):
-        m = NETLOC_RE.match(components.path)
-        return ParseResult(components.scheme, m.group(1), m.group(2),
-                           components.params, components.query,
-                           components.fragment)
-    else:
-        return components
+    """A wrapper for :py:func:`urlparse.urlparse` with the following
+    differences:
+
+    * Handles buckets in S3 URIs correctly. (:py:func:`~urlparse.urlparse`
+      does this correctly sometime after 2.6.1; this is just a patch for older
+      Python versions.)
+    * Splits the fragment correctly in all URIs, not just Web-related ones.
+      This behavior was fixed in the Python 2.7.4 standard library but we have
+      to back-port it for previous versions.
+    """
+    # we're probably going to mess with at least one of these values and
+    # re-pack the whole thing before we return it
+    (scheme, netloc, path, params, query, fragment) = (
+        urlparse_buggy(*args, **kwargs))
+    if netloc == '' and path.startswith('//'):
+        m = NETLOC_RE.match(path)
+        netloc = m.group(1)
+        path = m.group(2)
+    if '#' in path and not fragment:
+        path, fragment = path.split('#', 1)
+    return ParseResult(scheme, netloc, path, params, query, fragment)
 
 
 ### OPTION PARSING ###

From 9062519d5e76f0ba00b05f8277a0c79de0efb44a Mon Sep 17 00:00:00 2001
From: Stephen Johnson
Date: Sun, 4 Aug 2013 21:54:02 -0700
Subject: [PATCH 3/4] Support allow_fragments in mrjob.parse.urlparse()

---
 mrjob/parse.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/mrjob/parse.py b/mrjob/parse.py
index 46743b5bd..2f7be60c3 100644
--- a/mrjob/parse.py
+++ b/mrjob/parse.py
@@ -78,8 +78,7 @@ def parse_s3_uri(uri):
     return components.netloc, components.path[1:]
 
 
-@wraps(urlparse_buggy)
-def urlparse(*args, **kwargs):
+def urlparse(urlstring, scheme='', allow_fragments=True):
     """A wrapper for :py:func:`urlparse.urlparse` with the following
     differences:
 
@@ -91,14 +90,17 @@ def urlparse(urlstring, scheme='', allow_fragments=True):
     to back-port it for previous versions.
     """
     # we're probably going to mess with at least one of these values and
-    # re-pack the whole thing before we return it
+    # re-pack the whole thing before we return it.
+    # NB: urlparse_buggy()'s second argument changes names from
+    # 'default_scheme' to 'scheme' in Python 2.6, so urlparse_buggy() should
+    # be called with positional arguments.
     (scheme, netloc, path, params, query, fragment) = (
-        urlparse_buggy(*args, **kwargs))
+        urlparse_buggy(urlstring, scheme, allow_fragments))
     if netloc == '' and path.startswith('//'):
         m = NETLOC_RE.match(path)
         netloc = m.group(1)
         path = m.group(2)
-    if '#' in path and not fragment:
+    if allow_fragments and '#' in path and not fragment:
         path, fragment = path.split('#', 1)
     return ParseResult(scheme, netloc, path, params, query, fragment)
 

From e2318c7ac17be94ffa41b68b248d102d25b0107e Mon Sep 17 00:00:00 2001
From: Stephen Johnson
Date: Sun, 4 Aug 2013 21:55:29 -0700
Subject: [PATCH 4/4] Reinstate @wraps, future proof against extremely
 unlikely func sig changes

---
 mrjob/parse.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mrjob/parse.py b/mrjob/parse.py
index 2f7be60c3..d050e3e3f 100644
--- a/mrjob/parse.py
+++ b/mrjob/parse.py
@@ -78,7 +78,8 @@ def parse_s3_uri(uri):
     return components.netloc, components.path[1:]
 
 
-def urlparse(urlstring, scheme='', allow_fragments=True):
+@wraps(urlparse_buggy)
+def urlparse(urlstring, scheme='', allow_fragments=True, *args, **kwargs):
     """A wrapper for :py:func:`urlparse.urlparse` with the following
     differences:
 
@@ -95,7 +96,7 @@ def urlparse(urlstring, scheme='', allow_fragments=True, *args, **kwargs):
     # 'default_scheme' to 'scheme' in Python 2.6, so urlparse_buggy() should
     # be called with positional arguments.
     (scheme, netloc, path, params, query, fragment) = (
-        urlparse_buggy(urlstring, scheme, allow_fragments))
+        urlparse_buggy(urlstring, scheme, allow_fragments, *args, **kwargs))
     if netloc == '' and path.startswith('//'):
         m = NETLOC_RE.match(path)
         netloc = m.group(1)
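
A note for reviewers: with the full series applied, mrjob.parse.urlparse()
should return the same six components on every supported interpreter,
matching the Python 2.7.4+ standard library behavior. The sketch below is
hypothetical usage, not part of any patch; it assumes a Python 2 interpreter
and an importable mrjob checkout with these four patches applied:

    from mrjob.parse import urlparse

    # the fragment is split out of s3:// URIs on any Python version,
    # matching the updated test in PATCH 1/4
    assert urlparse('s3://bucket/path#customname') == \
        ('s3', 'bucket', '/path', '', '', 'customname')

    # bucket netlocs are recovered even on old interpreters that leave
    # netloc empty for non-web schemes
    assert urlparse('s3://bucket/path') == \
        ('s3', 'bucket', '/path', '', '', '')

    # allow_fragments=False (supported as of PATCH 3/4) is passed through
    # to the stdlib, so the '#' stays in the path
    assert urlparse('s3://bucket/path#customname', allow_fragments=False) == \
        ('s3', 'bucket', '/path#customname', '', '', '')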