Source code for synapse.lib.urlhelp

import urllib.parse

import regex
import synapse.exc as s_exc

def chopurl(url):
    '''
    A sane "stand alone" url parser.

    Splits a ``scheme://[user[:passwd]@]host[:port][/path][?query]`` string
    into its component parts.

    Example:

        info = chopurl(url)

    Args:
        url (str): The URL to parse. It must contain ``://``.

    Returns:
        dict: Always contains ``scheme`` (lower-cased), ``host`` and ``path``
        (an empty string when the URL has no path). The keys ``port`` (int),
        ``user``, ``passwd`` and ``query`` (dict) are only present when the
        corresponding part appears in the URL. ``user`` and ``passwd`` are
        percent-decoded; query keys and values are returned as-is, and a
        query key without ``=`` maps to ``None``.

    Raises:
        s_exc.BadUrl: If ``://`` is not found in the URL.
        ValueError: If a port is present but is not a valid integer.
    '''
    ret = {}
    if '://' not in url:
        raise s_exc.BadUrl(':// not found in [{}]!'.format(url))

    scheme, remain = url.split('://', 1)
    ret['scheme'] = scheme.lower()

    # carve query params from the end
    if '?' in remain:
        query = {}
        remain, queryrem = remain.split('?', 1)

        for param in queryrem.split('&'):
            qkey, sep, qval = param.partition('=')
            # a bare key (no '=') is recorded with a value of None
            query[qkey] = qval if sep else None

        ret['query'] = query

    # everything from the first slash onward is the path
    pathrem = ''
    slashoff = remain.find('/')
    if slashoff != -1:
        pathrem = remain[slashoff:]
        remain = remain[:slashoff]

    # detect user[:passwd]@netloc syntax
    if '@' in remain:
        # split on the last '@' so the host part never contains one
        user, remain = remain.rsplit('@', 1)
        if ':' in user:
            user, passwd = user.split(':', 1)
            ret['passwd'] = urllib.parse.unquote(passwd)

        ret['user'] = urllib.parse.unquote(user)

    # remain should be down to host[:port]

    # detect ipv6 [addr] and [addr]:port syntax
    if remain.startswith('['):

        if remain.endswith(']'):
            # bracketed address with no port; there is no port to parse
            ret['host'] = remain[1:-1]

        else:
            hostrem, portstr = remain.rsplit(':', 1)
            ret['port'] = int(portstr)
            ret['host'] = hostrem[1:-1]

    # detect ipv6 without port syntax
    elif remain.count(':') > 1:
        ret['host'] = remain

    # regular old host or host:port syntax
    else:

        if ':' in remain:
            remain, portstr = remain.split(':', 1)
            ret['port'] = int(portstr)

        ret['host'] = remain

    ret['path'] = pathrem
    return ret
# Pattern used by sanitizeUrl() to locate a password inside a URL.
# The named group ``front`` lazily captures the scheme, the ``://`` and the
# text up to and including a following colon (normally the first one; the
# engine may backtrack to a later colon if needed to complete the match).
# After ``front`` the pattern lazily matches one or more non-``/`` characters
# that must be immediately followed by an ``@``; the ``@`` itself is only
# asserted by a lookahead and is not part of the match.
_url_re = regex.compile(r'^(?P<front>.+?://.+?:)[^/]+?(?=@)')
def sanitizeUrl(url):
    '''
    Return the URL with its password (if any) masked as ``****``.

    RFC 3986 3.2.1:
        'Applications should not render as clear text any data after the
        first colon (":") character found within a userinfo subcomponent
        unless the data after the colon is the empty string (indicating no
        password)'

    In practice, the text between the colon that follows ``://`` and the
    next at sign is replaced. An empty password is left untouched, and a
    string that does not match the module-level ``_url_re`` pattern is
    returned unchanged.

    Note:
        This relies on the input being a reasonably well formatted URI that
        starts with a scheme (e.g. http) followed by ``://``. When that is
        not the case the original string is returned.

    Args:
        url (str): The URL to sanitize.

    Returns:
        str: The URL with the password portion replaced by ``****``.
    '''
    masked = _url_re.sub(r'\g<front>****', url)
    return masked