# Module     : postcode.py
# Synopsis   : UK postcode parser
# Programmer : Simon Brunning - simon@brunningonline.net
# Date       : 14 April 2004
# Version    : 1.0
# Copyright  : Released to the public domain. Provided as-is, with no warranty.
# Notes      :
'''UK postcode parser

Provides the parse_uk_postcode function for parsing UK postcodes.'''

import re

# Build up the regex patterns piece by piece.
#
# An outcode is a postal zone (one or two letters) followed by either a digit
# plus a restricted trailing letter, or by one or two digits. An incode is a
# digit followed by two letters drawn from INCODE_CHARS.
POSTAL_ZONES = ['AB', 'AL', 'B' , 'BA', 'BB', 'BD', 'BH', 'BL', 'BN', 'BR',
                'BS', 'BT', 'CA', 'CB', 'CF', 'CH', 'CM', 'CO', 'CR', 'CT',
                'CV', 'CW', 'DA', 'DD', 'DE', 'DG', 'DH', 'DL', 'DN', 'DT',
                'DY', 'E' , 'EC', 'EH', 'EN', 'EX', 'FK', 'FY', 'G' , 'GL',
                'GY', 'GU', 'HA', 'HD', 'HG', 'HP', 'HR', 'HS', 'HU', 'HX',
                'IG', 'IM', 'IP', 'IV', 'JE', 'KA', 'KT', 'KW', 'KY', 'L' ,
                'LA', 'LD', 'LE', 'LL', 'LN', 'LS', 'LU', 'M' , 'ME', 'MK',
                'ML', 'N' , 'NE', 'NG', 'NN', 'NP', 'NR', 'NW', 'OL', 'OX',
                'PA', 'PE', 'PH', 'PL', 'PO', 'PR', 'RG', 'RH', 'RM', 'S' ,
                'SA', 'SE', 'SG', 'SK', 'SL', 'SM', 'SN', 'SO', 'SP', 'SR',
                'SS', 'ST', 'SW', 'SY', 'TA', 'TD', 'TF', 'TN', 'TQ', 'TR',
                'TS', 'TW', 'UB', 'W' , 'WA', 'WC', 'WD', 'WF', 'WN', 'WR',
                'WS', 'WV', 'YO', 'ZE']
POSTAL_ZONES_ONE_CHAR = [zone for zone in POSTAL_ZONES if len(zone) == 1]
POSTAL_ZONES_TWO_CHARS = [zone for zone in POSTAL_ZONES if len(zone) == 2]
# Letters accepted after the digit of a one-letter-zone outcode
# (the 'A' in 'W1A').
THIRD_POS_CHARS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'S',
                   'T', 'U', 'W']
# Letters accepted after the digit of a two-letter-zone outcode
# (the 'A' in 'EC1A').
FOURTH_POS_CHARS = ['A', 'B', 'E', 'H', 'M', 'N', 'P', 'R', 'V', 'W', 'X',
                    'Y']
# Letters accepted in the last two positions of the incode.
INCODE_CHARS = ['A', 'B', 'D', 'E', 'F', 'G', 'H', 'J', 'L', 'N', 'P', 'Q',
                'R', 'S', 'T', 'U', 'W', 'X', 'Y', 'Z']
# Group 1 of any regex built from this pattern is the outcode.
OUTCODE_PATTERN = (r'(' +
                   r'(?:(?:' +
                   '|'.join(POSTAL_ZONES_ONE_CHAR) +
                   r')(?:\d[' +
                   ''.join(THIRD_POS_CHARS) +
                   r']|\d{1,2}))' +
                   r'|' +
                   r'(?:(?:' +
                   '|'.join(POSTAL_ZONES_TWO_CHARS) +
                   r')(?:\d[' +
                   ''.join(FOURTH_POS_CHARS) +
                   r']|\d{1,2}))' +
                   r')')
# Group 2 of the full postcode regex is the incode.
INCODE_PATTERN = (r'(\d[' +
                  ''.join(INCODE_CHARS) +
                  r'][' +
                  ''.join(INCODE_CHARS) +
                  r'])')
# Both patterns are anchored at the end of the string so that trailing
# characters cause a mismatch rather than being silently ignored.
POSTCODE_PATTERN = OUTCODE_PATTERN + INCODE_PATTERN + r'$'
STANDALONE_OUTCODE_PATTERN = OUTCODE_PATTERN + r'$'

# Compile regexs
POSTCODE_REGEX = re.compile(POSTCODE_PATTERN)
STANDALONE_OUTCODE_REGEX = re.compile(STANDALONE_OUTCODE_PATTERN)

def parse_uk_postcode(postcode, strict=True, incode_mandatory=True):
    '''Split UK postcode into outcode and incode portions.

    postcode            The postcode to be split. Spaces are removed and
                        letters are upper-cased before any other processing.
    strict              If true, the postcode will be validated according to
                        the rules as specified at the Universal Postal Union[1]
                        and The UK Government Data Standards Catalogue[2]. If
                        the supplied postcode doesn't adhere to these rules a
                        ValueError will be thrown. If false, the postcode is
                        simply split: four characters or fewer are treated as
                        an outcode on its own, otherwise the last three
                        characters are taken as the incode.
    incode_mandatory    If true, and only an outcode has been supplied, the
                        function will throw a ValueError.

    Returns:            outcode, incode (incode is '' when only an outcode
                        was supplied and incode_mandatory is false).
    Raises:             ValueError, if postcode is longer than seven
                        characters (after removing spaces), or if 'strict' or
                        'incode_mandatory' conditions are broken - see above.

    Usage example:      >>> from postcode import parse_uk_postcode
                        >>> parse_uk_postcode('cr0 2yr')
                        ('CR0', '2YR')
                        >>> parse_uk_postcode('cr0')
                        Traceback (most recent call last):
                          ...
                        ValueError: Incode mandatory
                        >>> parse_uk_postcode('cr0', False, False)
                        ('CR0', '')

    [1] http://www.upu.int/post_code/en/countries/GBR.pdf
    [2] http://www.govtalk.gov.uk/gdsc/html/noframes/PostCode-2-1-Release.htm
    '''
    postcode = postcode.replace(' ', '').upper() # Normalize

    if len(postcode) > 7:
        raise ValueError('Postcode too long')

    if not strict:
        # Just chop up whatever we've been given.
        if len(postcode) > 4:
            # Full postcode
            return postcode[:-3], postcode[-3:]
        # Outcode only
        if incode_mandatory:
            raise ValueError('Incode mandatory')
        return postcode, ''

    # Validate postcode.
    # Try for full postcode match
    postcode_match = POSTCODE_REGEX.match(postcode)
    if postcode_match:
        return postcode_match.group(1, 2)

    # Try for outcode only match
    outcode_match = STANDALONE_OUTCODE_REGEX.match(postcode)
    if outcode_match:
        if incode_mandatory:
            raise ValueError('Incode mandatory')
        return outcode_match.group(1), ''

    # Try Girobank special case, which does not follow the general pattern
    if postcode == 'GIR0AA':
        return 'GIR', '0AA'
    if postcode == 'GIR':
        if incode_mandatory:
            raise ValueError('Incode mandatory')
        return 'GIR', ''

    # None of the above
    raise ValueError('Invalid postcode')


if __name__ == '__main__':
    # Self test: run parse_uk_postcode over a table of inputs and compare
    # each result with the expected value. Each row is
    # (postcode, strict, incode_mandatory, expected), where expected is either
    # the (outcode, incode) tuple or the marker string 'ValueError' when the
    # call is expected to raise.
    #
    # print is always called with a single pre-formatted string so that the
    # output is the same whether print is a statement or a function.
    print('Self test:')
    test_data = [
                 ('cr0 2yr' , False, False, ('CR0' , '2YR')),
                 ('CR0 2YR' , False, False, ('CR0' , '2YR')),
                 ('cr02yr'  , False, False, ('CR0' , '2YR')),
                 ('dn16 9aa', False, False, ('DN16', '9AA')),
                 ('dn169aa' , False, False, ('DN16', '9AA')),
                 ('ec1a 1hq', False, False, ('EC1A', '1HQ')),
                 ('ec1a1hq' , False, False, ('EC1A', '1HQ')),
                 ('m2 5bq'  , False, False, ('M2'  , '5BQ')),
                 ('m25bq'   , False, False, ('M2'  , '5BQ')),
                 ('m34 4ab' , False, False, ('M34' , '4AB')),
                 ('m344ab'  , False, False, ('M34' , '4AB')),
                 ('sw19 2et', False, False, ('SW19', '2ET')),
                 ('sw192et' , False, False, ('SW19', '2ET')),
                 ('w1a 4zz' , False, False, ('W1A' , '4ZZ')),
                 ('w1a4zz'  , False, False, ('W1A' , '4ZZ')),
                 ('cr0'     , False, False, ('CR0' , ''   )),
                 ('sw19'    , False, False, ('SW19', ''   )),
                 ('xx0 2yr' , False, False, ('XX0' , '2YR')),
                 ('3r0 2yr' , False, False, ('3R0' , '2YR')),
                 ('20 2yr'  , False, False, ('20'  , '2YR')),
                 ('3r0 ayr' , False, False, ('3R0' , 'AYR')),
                 ('3r0 22r' , False, False, ('3R0' , '22R')),
                 ('w1m 4zz' , False, False, ('W1M' , '4ZZ')),
                 ('3r0'     , False, False, ('3R0' , ''   )),
                 ('ec1c 1hq', False, False, ('EC1C', '1HQ')),
                 ('m344cb'  , False, False, ('M34' , '4CB')),
                 ('gir 0aa' , False, False, ('GIR' , '0AA')),
                 ('gir'     , False, False, ('GIR' , ''   )),
                 ('w1m 4zz' , False, False, ('W1M' , '4ZZ')),
                 ('w1m'     , False, False, ('W1M' , ''   )),
                 ('dn169aaA', False, False, 'ValueError'   ),
                 ('cr0 2yr' , False, True , ('CR0',  '2YR')),
                 ('CR0 2YR' , False, True , ('CR0' , '2YR')),
                 ('cr02yr'  , False, True , ('CR0',  '2YR')),
                 ('dn16 9aa', False, True , ('DN16', '9AA')),
                 ('dn169aa' , False, True , ('DN16', '9AA')),
                 ('ec1a 1hq', False, True , ('EC1A', '1HQ')),
                 ('ec1a1hq' , False, True , ('EC1A', '1HQ')),
                 ('m2 5bq'  , False, True , ('M2'  , '5BQ')),
                 ('m25bq'   , False, True , ('M2'  , '5BQ')),
                 ('m34 4ab' , False, True , ('M34' , '4AB')),
                 ('m344ab'  , False, True , ('M34' , '4AB')),
                 ('sw19 2et', False, True , ('SW19', '2ET')),
                 ('sw192et' , False, True , ('SW19', '2ET')),
                 ('w1a 4zz' , False, True , ('W1A' , '4ZZ')),
                 ('w1a4zz'  , False, True , ('W1A' , '4ZZ')),
                 ('cr0'     , False, True , 'ValueError'   ),
                 ('sw19'    , False, True , 'ValueError'   ),
                 ('xx0 2yr' , False, True , ('XX0' , '2YR')),
                 ('3r0 2yr' , False, True , ('3R0' , '2YR')),
                 ('20 2yr'  , False, True , ('20'  , '2YR')),
                 ('3r0 ayr' , False, True , ('3R0' , 'AYR')),
                 ('3r0 22r' , False, True , ('3R0' , '22R')),
                 ('w1m 4zz' , False, True , ('W1M' , '4ZZ')),
                 ('3r0'     , False, True , 'ValueError'   ),
                 ('ec1c 1hq', False, True , ('EC1C', '1HQ')),
                 ('m344cb'  , False, True , ('M34' , '4CB')),
                 ('gir 0aa' , False, True , ('GIR' , '0AA')),
                 ('gir'     , False, True , 'ValueError'   ),
                 ('w1m 4zz' , False, True , ('W1M' , '4ZZ')),
                 ('w1m'     , False, True , 'ValueError'   ),
                 ('dn169aaA', False, True , 'ValueError'   ),
                 ('cr0 2yr' , True , False, ('CR0' , '2YR')),
                 ('CR0 2YR' , True , False, ('CR0' , '2YR')),
                 ('cr02yr'  , True , False, ('CR0' , '2YR')),
                 ('dn16 9aa', True , False, ('DN16', '9AA')),
                 ('dn169aa' , True , False, ('DN16', '9AA')),
                 ('ec1a 1hq', True , False, ('EC1A', '1HQ')),
                 ('ec1a1hq' , True , False, ('EC1A', '1HQ')),
                 ('m2 5bq'  , True , False, ('M2'  , '5BQ')),
                 ('m25bq'   , True , False, ('M2'  , '5BQ')),
                 ('m34 4ab' , True , False, ('M34' , '4AB')),
                 ('m344ab'  , True , False, ('M34' , '4AB')),
                 ('sw19 2et', True , False, ('SW19', '2ET')),
                 ('sw192et' , True , False, ('SW19', '2ET')),
                 ('w1a 4zz' , True , False, ('W1A' , '4ZZ')),
                 ('w1a4zz'  , True , False, ('W1A' , '4ZZ')),
                 ('cr0'     , True , False, ('CR0' , ''   )),
                 ('sw19'    , True , False, ('SW19', ''   )),
                 ('xx0 2yr' , True , False, 'ValueError'   ),
                 ('3r0 2yr' , True , False, 'ValueError'   ),
                 ('20 2yr'  , True , False, 'ValueError'   ),
                 ('3r0 ayr' , True , False, 'ValueError'   ),
                 ('3r0 22r' , True , False, 'ValueError'   ),
                 ('w1m 4zz' , True , False, 'ValueError'   ),
                 ('3r0'     , True , False, 'ValueError'   ),
                 ('ec1c 1hq', True , False, 'ValueError'   ),
                 ('m344cb'  , True , False, 'ValueError'   ),
                 ('gir 0aa' , True , False, ('GIR' , '0AA')),
                 ('gir'     , True , False, ('GIR' , ''   )),
                 ('w1m 4zz' , True , False, 'ValueError'   ),
                 ('w1m'     , True , False, 'ValueError'   ),
                 ('dn169aaA', True , False, 'ValueError'   ),
                 ('cr0 2yr' , True , True , ('CR0',  '2YR')),
                 ('CR0 2YR' , True , True , ('CR0' , '2YR')),
                 ('cr02yr'  , True , True , ('CR0',  '2YR')),
                 ('dn16 9aa', True , True , ('DN16', '9AA')),
                 ('dn169aa' , True , True , ('DN16', '9AA')),
                 ('ec1a 1hq', True , True , ('EC1A', '1HQ')),
                 ('ec1a1hq' , True , True , ('EC1A', '1HQ')),
                 ('m2 5bq'  , True , True , ('M2'  , '5BQ')),
                 ('m25bq'   , True , True , ('M2'  , '5BQ')),
                 ('m34 4ab' , True , True , ('M34' , '4AB')),
                 ('m344ab'  , True , True , ('M34' , '4AB')),
                 ('sw19 2et', True , True , ('SW19', '2ET')),
                 ('sw192et' , True , True , ('SW19', '2ET')),
                 ('w1a 4zz' , True , True , ('W1A' , '4ZZ')),
                 ('w1a4zz'  , True , True , ('W1A' , '4ZZ')),
                 ('cr0'     , True , True , 'ValueError'   ),
                 ('sw19'    , True , True , 'ValueError'   ),
                 ('xx0 2yr' , True , True , 'ValueError'   ),
                 ('3r0 2yr' , True , True , 'ValueError'   ),
                 ('20 2yr'  , True , True , 'ValueError'   ),
                 ('3r0 ayr' , True , True , 'ValueError'   ),
                 ('3r0 22r' , True , True , 'ValueError'   ),
                 ('w1m 4zz' , True , True , 'ValueError'   ),
                 ('3r0'     , True , True , 'ValueError'   ),
                 ('ec1c 1hq', True , True , 'ValueError'   ),
                 ('m344cb'  , True , True , 'ValueError'   ),
                 ('gir 0aa' , True , True , ('GIR' , '0AA')),
                 ('gir'     , True , True , 'ValueError'   ),
                 ('w1m 4zz' , True , True , 'ValueError'   ),
                 ('w1m'     , True , True , 'ValueError'   ),
                 ('dn169aaA', True , True , 'ValueError'   ),
                ]
    passes, failures = 0, 0
    for postcode, strict, incode_mandatory, required_result in test_data:
        # A raised ValueError is recorded as the marker string so it can be
        # compared against the expected column like any other result.
        try:
            actual_result = parse_uk_postcode(postcode, strict, incode_mandatory)
        except ValueError:
            actual_result = 'ValueError'
        if actual_result != required_result:
            failures += 1
            print('Failed: %r != %r for input postcode = %r, strict = %r, '
                  'incode_mandatory = %r'
                  % (actual_result, required_result, postcode, strict,
                     incode_mandatory))
        else:
            passes += 1
    if failures:
        print('%d failures. :-(' % failures)
        print('%d passed.' % passes)
    else:
        print('%d passed! ;-)' % passes)