|
| 1 | +import re |
| 2 | +import calendar |
| 3 | +import six |
| 4 | + |
# ``re.ASCII`` only exists on Python 3, where it restricts ``\d`` to the ASCII
# digits 0-9. Detect the flag itself rather than the interpreter version; when
# the flag is absent no extra flag is applied.
RFC3339_REGEX_FLAGS = getattr(re, "ASCII", 0)

# Pattern for an RFC 3339 ``date-time``. Only year, month and day are captured
# (in that order); the time and offset parts are validated by the pattern
# alone. The pattern is anchored with ``\Z`` rather than ``$`` because ``$``
# also matches just before a trailing newline, which would let
# "2020-01-01T00:00:00Z\n" through.
RFC3339_REGEX = re.compile(r"""
    ^
    (\d{4})                 # Year
    -
    (0[1-9]|1[0-2])         # Month
    -
    (\d{2})                 # Day
    T
    (?:[01]\d|2[0123])      # Hours
    :
    (?:[0-5]\d)             # Minutes
    :
    (?:[0-5]\d)             # Seconds
    (?:\.\d+)?              # Secfrac
    (?:  Z                                  # UTC
      |  [+-](?:[01]\d|2[0123]):[0-5]\d     # Offset
    )
    \Z
""", re.VERBOSE | RFC3339_REGEX_FLAGS)
| 28 | + |
| 29 | + |
def validate_rfc3339(date_string):
    """
    Tell whether ``date_string`` is a valid RFC 3339 date-time.

    The string has to match ``RFC3339_REGEX`` and its year/month/day part has
    to name a day that exists in that month (leap years included).
    Leap seconds are not supported.

    Returns ``True`` when the string is valid, ``False`` otherwise.
    """
    parsed = RFC3339_REGEX.match(date_string)
    if parsed is None:
        return False

    # The pattern captures exactly three groups: year, month and day.
    year, month, day = (int(part) for part in parsed.groups())
    if year == 0:
        # Year 0 is not a valid year.
        return False

    # monthrange() returns (weekday of the 1st, number of days in the month).
    days_in_month = calendar.monthrange(year, month)[1]
    return 1 <= day <= days_in_month
| 46 | + |
| 47 | + |
# The following regex rules reference the ABNF terminology from
# [RFC3986](https://tools.ietf.org/html/rfc3986#appendix-A)

# Building blocks of the IPv6 rule, named after the ABNF rules they implement.
_H16 = r"[0-9A-Fa-f]{1,4}"                 # h16: 1-4 hex digits
_H16_COLON = r"(?:" + _H16 + r":)"         # h16 ":"
# Decimal octet 0-255 (this pattern also accepts leading zeros, e.g. "01").
_DEC_OCTET = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
_IPV4 = r"(?:" + _DEC_OCTET + r"\.){3}" + _DEC_OCTET
# ls32: the last 32 bits, written either as h16 ":" h16 or as an IPv4 address.
_LS32 = r"(?:" + _H16 + r":" + _H16 + r"|" + _IPV4 + r")"

# IPv6 validation rule: one alternative per line of the RFC 3986 IPv6address
# rule, in the same order. The resulting pattern contains no insignificant
# whitespace, so it can be used with or without re.VERBOSE.
IPv6_RE = r"(?:" + r"|".join((
    _H16_COLON + r"{6}" + _LS32,
    r"::" + _H16_COLON + r"{5}" + _LS32,
    r"(?:" + _H16 + r")?::" + _H16_COLON + r"{4}" + _LS32,
    r"(?:" + _H16_COLON + r"?" + _H16 + r")?::" + _H16_COLON + r"{3}" + _LS32,
    r"(?:" + _H16_COLON + r"{,2}" + _H16 + r")?::" + _H16_COLON + r"{2}" + _LS32,
    r"(?:" + _H16_COLON + r"{,3}" + _H16 + r")?::" + _H16_COLON + _LS32,
    r"(?:" + _H16_COLON + r"{,4}" + _H16 + r")?::" + _LS32,
    r"(?:" + _H16_COLON + r"{,5}" + _H16 + r")?::" + _H16,
    r"(?:" + _H16_COLON + r"{,6}" + _H16 + r")?::",
)) + r")"


# NOTE: the multi-line patterns below contain layout whitespace and inline
# comments, so they are only meaningful when compiled with re.VERBOSE.

# An authority is defined as: [ userinfo "@" ] host [ ":" port ]
AUTHORITY_RE = r"""
    (?:(?:[a-zA-Z0-9_.~\-!$&'()*+,;=:]|%[0-9A-Fa-f]{{2}})*@)? # user info
    (?:
        \[(?:{ip_v6}|v[0-9A-Fa-f]+\.[a-zA-Z0-9_.~\-!$&'()*+,;=:]+)\] # IP-literal
      | (?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){{3}}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?) # IPv4
      | (?:[a-zA-Z0-9_.~\-!$&'()*+,;=]|%[0-9A-Fa-f]{{2}})* # reg-name
    ) # host
    (?::[0-9]*)? # port
""".format(ip_v6=IPv6_RE,)
# Path char regex rule
PCHAR_RE = r"(?:[a-zA-Z0-9_.~\-!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})"
# Query and Fragment rules are exactly the same
QUERY_RE = r"(?:[a-zA-Z0-9_.~\-!$&'()*+,;=:@/?]|%[0-9A-Fa-f]{2})*"
# A URI is defined as: scheme ":" hier-part [ "?" query ] [ "#" fragment ]
URI_RE = r"""
    [a-zA-Z][a-zA-Z0-9+.-]* #scheme
    :
    (?:
        //
        {authority}
        (?:/{pchar}*)* # path-abempty
      | /(?:{pchar}+ (?:/{pchar}*)*)? # path-absolute
      | {pchar}+ (?:/{pchar}*)* # path-rootless
      | # or nothing
    ) # hier-part
    (?:\?{query})? # Query
    (?:\#{fragment})? # Fragment
""".format(
    authority=AUTHORITY_RE,
    query=QUERY_RE,
    fragment=QUERY_RE,
    pchar=PCHAR_RE
)

# A relative-ref is defined as: relative-part [ "?" query ] [ "#" fragment ]
RELATIVE_REF_RE = r"""
    (?:
        //
        {authority}
        (?:/{pchar}*)* # path-abempty
      | /(?:{pchar}+ (?:/{pchar}*)*)? # path-absolute
      | (?:[a-zA-Z0-9_.~\-!$&'()*+,;=@]|%[0-9A-Fa-f]{{2}})+ (?:/{pchar}*)* # path-noscheme
      | # or nothing
    ) # relative-part
    (?:\?{query})? # Query
    (?:\#{fragment})? # Fragment
""".format(
    authority=AUTHORITY_RE,
    query=QUERY_RE,
    fragment=QUERY_RE,
    pchar=PCHAR_RE
)
# Compiled URI regex rule.
# Both compiled rules are anchored with ``\Z`` instead of ``$``: ``$`` also
# matches just before a trailing newline, which would accept input such as
# "http://example.com\n".
URI_RE_COMP = re.compile(r"^{uri_re}\Z".format(uri_re=URI_RE), re.VERBOSE)
# Compiled URI-reference regex rule. URI-reference is defined as: URI / relative-ref
URI_REF_RE_COMP = re.compile(r"^(?:{uri_re}|{relative_ref})\Z".format(
    uri_re=URI_RE,
    relative_ref=RELATIVE_REF_RE,
), re.VERBOSE)
| 130 | + |
| 131 | + |
def validate_rfc3986(url, rule='URI'):
    """
    Match ``url`` against one of the RFC 3986 rules.

    ``rule`` selects the grammar: ``'URI'`` (the default) uses
    ``URI_RE_COMP`` and ``'URI_reference'`` uses ``URI_REF_RE_COMP``.

    Returns the regex match object when ``url`` conforms to the rule and
    ``None`` otherwise, so the result can be used as a truth value.

    Raises ``ValueError`` when ``rule`` is not one of the two supported names.
    """
    if rule == 'URI':
        pattern = URI_RE_COMP
    elif rule == 'URI_reference':
        pattern = URI_REF_RE_COMP
    else:
        raise ValueError('Invalid rule')
    return pattern.match(url)
0 commit comments