From 0a8e55454d24f0006eb96d387f8798884f14b227 Mon Sep 17 00:00:00 2001
From: Stephen Johnson
Date: Sat, 3 Aug 2013 12:48:51 -0700
Subject: [PATCH 1/4] Normalize urlparse behavior for s3, s3n, hdfs, mapreduce
 url schemes for all Python versions

---
 mrjob/parse.py      | 16 +++++++++++++++-
 tests/test_parse.py |  2 +-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/mrjob/parse.py b/mrjob/parse.py
index ebb156ed0..13e2b4d4b 100644
--- a/mrjob/parse.py
+++ b/mrjob/parse.py
@@ -15,8 +15,10 @@
 from functools import wraps
 import logging
 import re
+import urlparse
 from urlparse import ParseResult
-from urlparse import urlparse as urlparse_buggy
+
+urlparse_buggy = urlparse.urlparse
 
 try:
     from cStringIO import StringIO
@@ -27,6 +29,18 @@
 
 from mrjob.compat import uses_020_counters
 
 
+### danger: monkey patch urlparse on python < 2.7.4 ###
+
+if hasattr(urlparse, 'uses_fragment'):
+    # Python 2.7.4 changes the behavior of urlparse to parse the fragment out
+    # of any URL scheme, which breaks our tests and possibly our code. The
+    # below patch will normalize the behavior for the listed URL schemes.
+    #
+    # http://bugs.python.org/issue9374
+    # http://hg.python.org/cpython/rev/79e6ff3d9afd
+    urlparse.uses_fragment.extend(['s3', 's3n', 'hdfs', 'mapreduce'])
+
+
 # match the filename of a hadoop streaming jar
 HADOOP_STREAMING_JAR_RE = re.compile(r'^hadoop.*streaming.*\.jar$')
 
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 5a98a4160..ac7a985c5 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -540,7 +540,7 @@ def test_urlparse(self):
         self.assertEqual(urlparse('s3://bucket/path'),
                          ('s3', 'bucket', '/path', '', '', ''))
         self.assertEqual(urlparse('s3://bucket/path#customname'),
-                         ('s3', 'bucket', '/path#customname', '', '', ''))
+                         ('s3', 'bucket', '/path', '', '', 'customname'))
         self.assertEqual(urlparse('s3://bucket'),
                          ('s3', 'bucket', '', '', '', ''))
         self.assertEqual(urlparse('s3://bucket/'),

From 90dce482f11176d6bebc239a623449240df468f7 Mon Sep 17 00:00:00 2001
From: Stephen Johnson
Date: Sun, 4 Aug 2013 17:26:20 -0700
Subject: [PATCH 2/4] Better solution to py2.7.4 urlparse behavior fix

---
 mrjob/parse.py | 48 ++++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/mrjob/parse.py b/mrjob/parse.py
index 13e2b4d4b..46743b5bd 100644
--- a/mrjob/parse.py
+++ b/mrjob/parse.py
@@ -15,10 +15,8 @@
 from functools import wraps
 import logging
 import re
-import urlparse
 from urlparse import ParseResult
-
-urlparse_buggy = urlparse.urlparse
+from urlparse import urlparse as urlparse_buggy
 
 try:
     from cStringIO import StringIO
@@ -29,18 +27,6 @@
 
 from mrjob.compat import uses_020_counters
 
 
-### danger: monkey patch urlparse on python < 2.7.4 ###
-
-if hasattr(urlparse, 'uses_fragment'):
-    # Python 2.7.4 changes the behavior of urlparse to parse the fragment out
-    # of any URL scheme, which breaks our tests and possibly our code. The
-    # below patch will normalize the behavior for the listed URL schemes.
-    #
-    # http://bugs.python.org/issue9374
-    # http://hg.python.org/cpython/rev/79e6ff3d9afd
-    urlparse.uses_fragment.extend(['s3', 's3n', 'hdfs', 'mapreduce'])
-
-
 # match the filename of a hadoop streaming jar
 HADOOP_STREAMING_JAR_RE = re.compile(r'^hadoop.*streaming.*\.jar$')
 
@@ -94,17 +80,27 @@ def parse_s3_uri(uri):
 
 @wraps(urlparse_buggy)
 def urlparse(*args, **kwargs):
-    """A wrapper for :py:func:`urlparse.urlparse` that handles buckets in S3
-    URIs correctly. (:py:func:`~urlparse.urlparse` does this correctly sometime
-    after 2.6.1; this is just a patch for older Python versions.)"""
-    components = urlparse_buggy(*args, **kwargs)
-    if components.netloc == '' and components.path.startswith('//'):
-        m = NETLOC_RE.match(components.path)
-        return ParseResult(components.scheme, m.group(1), m.group(2),
-                           components.params, components.query,
-                           components.fragment)
-    else:
-        return components
+    """A wrapper for :py:func:`urlparse.urlparse` with the following
+    differences:
+
+    * Handles buckets in S3 URIs correctly. (:py:func:`~urlparse.urlparse`
+      does this correctly sometime after 2.6.1; this is just a patch for older
+      Python versions.)
+    * Splits the fragment correctly in all URIs, not just Web-related ones.
+      This behavior was fixed in the Python 2.7.4 standard library but we have
+      to back-port it for previous versions.
+    """
+    # we're probably going to mess with at least one of these values and
+    # re-pack the whole thing before we return it
+    (scheme, netloc, path, params, query, fragment) = (
+        urlparse_buggy(*args, **kwargs))
+    if netloc == '' and path.startswith('//'):
+        m = NETLOC_RE.match(path)
+        netloc = m.group(1)
+        path = m.group(2)
+    if '#' in path and not fragment:
+        path, fragment = path.split('#', 1)
+    return ParseResult(scheme, netloc, path, params, query, fragment)
 
 
 ### OPTION PARSING ###

From 9062519d5e76f0ba00b05f8277a0c79de0efb44a Mon Sep 17 00:00:00 2001
From: Stephen Johnson
Date: Sun, 4 Aug 2013 21:54:02 -0700
Subject: [PATCH 3/4] Support allow_fragments in mrjob.parse.urlparse()

---
 mrjob/parse.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/mrjob/parse.py b/mrjob/parse.py
index 46743b5bd..2f7be60c3 100644
--- a/mrjob/parse.py
+++ b/mrjob/parse.py
@@ -78,8 +78,7 @@ def parse_s3_uri(uri):
     return components.netloc, components.path[1:]
 
 
-@wraps(urlparse_buggy)
-def urlparse(*args, **kwargs):
+def urlparse(urlstring, scheme='', allow_fragments=True):
     """A wrapper for :py:func:`urlparse.urlparse` with the following
     differences:
 
@@ -91,14 +90,17 @@ def urlparse(urlstring, scheme='', allow_fragments=True):
     to back-port it for previous versions.
     """
     # we're probably going to mess with at least one of these values and
-    # re-pack the whole thing before we return it
+    # re-pack the whole thing before we return it.
+    # NB: urlparse_buggy()'s second argument changes names from
+    # 'default_scheme' to 'scheme' in Python 2.6, so urlparse_buggy() should
+    # be called with positional arguments.
     (scheme, netloc, path, params, query, fragment) = (
-        urlparse_buggy(*args, **kwargs))
+        urlparse_buggy(urlstring, scheme, allow_fragments))
     if netloc == '' and path.startswith('//'):
         m = NETLOC_RE.match(path)
         netloc = m.group(1)
         path = m.group(2)
-    if '#' in path and not fragment:
+    if allow_fragments and '#' in path and not fragment:
         path, fragment = path.split('#', 1)
     return ParseResult(scheme, netloc, path, params, query, fragment)
 

From e2318c7ac17be94ffa41b68b248d102d25b0107e Mon Sep 17 00:00:00 2001
From: Stephen Johnson
Date: Sun, 4 Aug 2013 21:55:29 -0700
Subject: [PATCH 4/4] Reinstate @wraps, future proof against extremely
 unlikely func sig changes

---
 mrjob/parse.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mrjob/parse.py b/mrjob/parse.py
index 2f7be60c3..d050e3e3f 100644
--- a/mrjob/parse.py
+++ b/mrjob/parse.py
@@ -78,7 +78,8 @@ def parse_s3_uri(uri):
     return components.netloc, components.path[1:]
 
 
-def urlparse(urlstring, scheme='', allow_fragments=True):
+@wraps(urlparse_buggy)
+def urlparse(urlstring, scheme='', allow_fragments=True, *args, **kwargs):
     """A wrapper for :py:func:`urlparse.urlparse` with the following
     differences:
 
@@ -95,7 +96,7 @@ def urlparse(urlstring, scheme='', allow_fragments=True, *args, **kwargs):
     # 'default_scheme' to 'scheme' in Python 2.6, so urlparse_buggy() should
     # be called with positional arguments.
     (scheme, netloc, path, params, query, fragment) = (
-        urlparse_buggy(urlstring, scheme, allow_fragments))
+        urlparse_buggy(urlstring, scheme, allow_fragments, *args, **kwargs))
     if netloc == '' and path.startswith('//'):
         m = NETLOC_RE.match(path)
         netloc = m.group(1)
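
A note for reviewers: with the full series applied, mrjob.parse.urlparse()
should return the same six components on every supported interpreter,
matching the Python 2.7.4+ standard library behavior. The sketch below is
hypothetical usage, not part of any patch; it assumes a Python 2 interpreter
and an importable mrjob checkout with these four patches applied:

    from mrjob.parse import urlparse

    # the fragment is split out of s3:// URIs on any Python version,
    # matching the updated test in PATCH 1/4
    assert urlparse('s3://bucket/path#customname') == \
        ('s3', 'bucket', '/path', '', '', 'customname')

    # bucket netlocs are recovered even on old interpreters that leave
    # netloc empty for non-web schemes
    assert urlparse('s3://bucket/path') == \
        ('s3', 'bucket', '/path', '', '', '')

    # allow_fragments=False (supported as of PATCH 3/4) is passed through
    # to the stdlib, so the '#' stays in the path
    assert urlparse('s3://bucket/path#customname', allow_fragments=False) == \
        ('s3', 'bucket', '/path#customname', '', '', '')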