Yelp · coyotemarin · Aug 5, 2013 · Aug 3, 2013 · Aug 5, 2013 · Aug 5, 2013
diff --git a/mrjob/parse.py b/mrjob/parse.py
@@ -79,18 +79,31 @@ def parse_s3_uri(uri):
 
 
 @wraps(urlparse_buggy)
-def urlparse(*args, **kwargs):
-    """A wrapper for :py:func:`urlparse.urlparse` that handles buckets in S3
-    URIs correctly. (:py:func:`~urlparse.urlparse` does this correctly sometime
-    after 2.6.1; this is just a patch for older Python versions.)"""
-    components = urlparse_buggy(*args, **kwargs)
-    if components.netloc == '' and components.path.startswith('//'):
-        m = NETLOC_RE.match(components.path)
-        return ParseResult(components.scheme, m.group(1), m.group(2),
-                           components.params, components.query,
-                           components.fragment)
-    else:
-        return components
+def urlparse(urlstring, scheme='', allow_fragments=True, *args, **kwargs):
+    """A wrapper for :py:func:`urlparse.urlparse` with the following
+    differences:
+
+    * Handles buckets in S3 URIs correctly. (:py:func:`~urlparse.urlparse`
+      does this correctly sometime after 2.6.1; this is just a patch for older
+      Python versions.)
+    * Splits the fragment correctly in all URIs, not just Web-related ones.
+      This behavior was fixed in the Python 2.7.4 standard library but we have
+      to back-port it for previous versions.
+    """
+    # we're probably going to mess with at least one of these values and
+    # re-pack the whole thing before we return it.
+    # NB: urlparse_buggy()'s second argument changes names from
+    # 'default_scheme' to 'scheme' in Python 2.6, so urlparse_buggy() should
+    # be called with positional arguments.
+    (scheme, netloc, path, params, query, fragment) = (
+        urlparse_buggy(urlstring, scheme, allow_fragments, *args, **kwargs))
+    if netloc == '' and path.startswith('//'):
+        m = NETLOC_RE.match(path)
+        netloc = m.group(1)
+        path = m.group(2)  # group 1 is the netloc; group 2 is the path
+    if allow_fragments and '#' in path and not fragment:
+        path, fragment = path.split('#', 1)
+    return ParseResult(scheme, netloc, path, params, query, fragment)
 
 
 ### OPTION PARSING ###

diff --git a/tests/test_parse.py b/tests/test_parse.py
@@ -540,7 +540,7 @@ def test_urlparse(self):
         self.assertEqual(urlparse('s3://bucket/path'),
                          ('s3', 'bucket', '/path', '', '', ''))
         self.assertEqual(urlparse('s3://bucket/path#customname'),
-                         ('s3', 'bucket', '/path#customname', '', '', ''))
+                         ('s3', 'bucket', '/path', '', '', 'customname'))
         self.assertEqual(urlparse('s3://bucket'),
                          ('s3', 'bucket', '', '', '', ''))
         self.assertEqual(urlparse('s3://bucket/'),