# UMBR2
# Part 01: Geographical lookups
#
# This prepares lookup tables to convert England and Wales 2001 Census
# geographies into 2011 Census geographies. Specifically, it provides
# one-to-one best-fit matching of 2001 Output Areas to 2011 LSOAs. In
# total, 352 of 175,434 2001 OAs are connected to multiple 2011 LSOAs,
# and are assigned to one by a point-in-polygon method.
#
# We implement the following rules:
# 1. Unchanged
# 2. OAs that are subdivided but only into one LSOA
# 3. OAs that are subdivided into multiple LSOAs in the same MSOA
# 4. OAs that are subdivided into multiple LSOAs in multiple MSOAs
#
# No special conversions are required for Scotland, since as at Jun
# 2014, 2011 Census data and population estimates are provided for
# 2001-based Datazones.
#
# The source files required for this stage are all acquired from:
# https://geoportal.statistics.gov.uk/

source("umbr_functions.r")
    
library(rgdal)
library(rgeos)

# The 2001 LSOA lookup (these are unambiguous nestings)
oa01.to.lsoa01 <- fread("~/borders/lookup/oa-2-lsoa_or_dz.csv")
setnames(oa01.to.lsoa01, c("oa_code", "geogcode"), c("OA01CDO", "LSOA01CD"))
setkeyv(oa01.to.lsoa01, c("OA01CDO"))
# ONS Lookup from 2001 OAs to 2011 OAs
oa01.to.oa11 <- fread("data/OA01_OA11_LAD11_EW_LU.csv")
oa01.to.oa11[, c("LAD11CD", "LAD11NM", "LAD11NMW")] <- NULL
setkeyv(oa01.to.oa11, c("OA01CDO", "OA01CD") )

lkup <- merge(oa01.to.lsoa01, oa01.to.oa11, by="OA01CDO")

# ONS Lookup from OA 2011 to LSOA 2011
oa11.to.lsoa11 <- fread("data/OA11_LSOA11_MSOA11_LAD11_EW_LUv2.csv")
oa11.to.lsoa11[,c("LAD11NMW")] <- NULL

# Merge all together 
lkup <- merge(lkup, oa11.to.lsoa11, by="OA11CD")
setkeyv(lkup, c("OA01CD", "LSOA11CD", "MSOA11CD"))

# Check that it has the official numbers of unique items
stopifnot(unql(lkup$OA01CD) == 175434)
stopifnot(unql(lkup$LSOA11CD) == 34753)

# Save lists of unique codes, will need these to test adequacy of
# solution later
oa01.codes   <- unq(lkup$OA01CD)
lsoa01.codes <- unq(lkup$LSOA01CD)
lsoa11.codes <- unq(lkup$LSOA11CD)

# Create a field for the LSOA11 linked to by this OA01
lkup[, LSOA.DEST  := LSOA11CD]
lkup[, GeogChange := "A"]

# These are all the 2001 OAs that are assigned to multiple 2011 OAs
oa01.split <- subset(lkup, CHGIND == "S")

# For each OA, count the number of separate 2011 LSOAs and MSOAs into
# which it is finally assigned
oa01.split[,n.lsoa := length(unique(LSOA11CD)), by="OA01CD"]
oa01.split[,n.msoa := length(unique(MSOA11CD)), by="OA01CD"]
# Are the LSOAs and MSOAs made up only of OAs uniquely assigned to it
oa01.split[,lsoa.any.multi := any(n.lsoa > 1), by="LSOA11CD"]
oa01.split[,msoa.any.multi := any(n.msoa > 1), by="MSOA11CD"]

# FIRST - REALLY complicated splits
oa.split.msoa <- oa01.split[oa01.split$msoa.any.multi,]

# Although they belong to multiple MSOAs, assign each OA01 to one on a
# simple majority vote of how many of its OA11 are part of the final
# MSOA
oa.split.msoa[, oa11.in.msoa := length(unique(OA11CD)), by=c("MSOA11CD", "OA01CD")]
oa.split.msoa[, oa11.from.oa01 := length(unique(OA11CD)), by=c("OA01CD")]
oa.split.msoa[, oa.msoa.share := oa11.in.msoa / oa11.from.oa01 ]
oa.split.msoa[, is.max.share := ( oa.msoa.share == max(oa.msoa.share) ), by="OA01CDO"]
# These are 50/50s, decide arbitrarily
# One in Peterborough
oa.split.msoa[OA01CDO == "00JANR0001" & MSOA11CD == "E02006878", is.max.share := FALSE]
# One in Birmingham
oa.split.msoa[OA01CDO == "00CNFS0003" & MSOA11CD == "E02006898", is.max.share := FALSE]

# Delete all but the strongest links
strong.links <- oa.split.msoa[ is.max.share == TRUE,]
lkup.del <- oa.split.msoa[ is.max.share == FALSE, paste(OA01CD, OA11CD)]
# summary(paste(lkup$OA01CD, lkup$OA11CD) %in% lkup.del)
lkup <- lkup[ ! ( paste(OA01CD, OA11CD) %in% lkup.del ),]
oa01.split <- oa01.split[  ! ( paste(OA01CD, OA11CD) %in% lkup.del ),]

# Recalculate matching without the deleted strong links
oa01.split[,n.lsoa := unql(LSOA11CD), by="OA01CD"]
oa01.split[,n.msoa := unql(MSOA11CD), by="OA01CD"]
# Are the LSOAs and MSOAs made up only of OAs uniquely assigned to it
oa01.split[,lsoa.any.multi := any(n.lsoa > 1), by="LSOA11CD"]
oa01.split[,msoa.any.multi := any(n.msoa > 1), by="MSOA11CD"]

# NOW THOSE WITH SIMPLE MAPPING
splits.simple <- oa01.split[n.lsoa == 1,]
# Identify ALL LSOAs which contain at least some simple split OAs
tgts <- subset(lkup, LSOA11CD %in% splits.simple$LSOA11CD)
# Which of those LSOAs will now receive benefits data from multiple LSOA01 sources
tgts[, n.dest.lsoa := length(unique(LSOA11CD)), by="LSOA01CD"]
# Mapping is simple for these
lkup[LSOA11CD %in% splits.simple$LSOA11CD, LSOA.DEST := LSOA11CD]
lkup[LSOA11CD %in% tgts[n.dest.lsoa > 1, LSOA11CD], GeogChange := "B"]

# MORE COMPLEX MAPPINGS
# Some GIS is helpful in diagnosing these
# Population centroids of 2001 Output Areas
oa01.centroid <- readOGR("/home/alex/borders/EW_OA_2001_centroids", layer="OA_2001_EW_PWC")
# Boundaries of 2011 LSOAs
lsoa11.poly <- readOGR("/home/alex/borders/EW_lsoa_2011", layer="LSOA_2011_EW_BGC")

# OAs that are split into multiple LSOAs within the same MSOA
oa.split.lsoa <- oa01.split[lsoa.any.multi & ! msoa.any.multi,]

# Are all LSOA polygons in the set touching at least one other in the
# set?
touching.together <- function(lsoa.ids) {
    grp.polys <- lsoa11.poly[lsoa11.poly@data$LSOA11CD %in% lsoa.ids,]
    isect <- gIntersects(grp.polys, byid=TRUE)
    all(rowSums(isect) > 1)
}

# The population centroid is nearest to which LSOA?
whose.nearest <- function(oa.id, lsoa.ids) {
    ctr <- oa01.centroid[oa01.centroid@data$OA01CD == oa.id,]
    grp.polys <- lsoa11.poly[lsoa11.poly@data$LSOA11CD %in% lsoa.ids,]
    dists <- gDistance(ctr, grp.polys, byid=TRUE)
    as.character(grp.polys@data[dists == min(dists), "LSOA11CD"])
}

# Look at the these, check they are contiguous within the MSOA
oa.split.lsoa[, contiguous := touching.together(LSOA11CD), by="MSOA11CD"]
summary(oa.split.lsoa$contiguous)

# Set their target to one of the LSOAs within that MSOA
oa.split.lsoa[, LSOA.DEST := LSOA11CD[1], by="MSOA11CD"]

# Update the main lookup table
for ( comb.lsoa in unique(oa.split.lsoa$LSOA11CD) ) {
    lkup[LSOA11CD == comb.lsoa,
             LSOA.DEST := oa.split.lsoa[LSOA11CD == comb.lsoa, unique(LSOA.DEST)]]
    lkup[LSOA11CD == comb.lsoa,
             GeogChange := "C"]
}
lkup[GeogChange == "C" & ( LSOA.DEST != LSOA11CD ), GeogChange := "Cx"]

# Are all OA01 present and unambiguously assigned to no more than
# LSOA.DEST>?
stopifnot( unql(lkup$OA01CD) == unql(oa01.codes))
uniqueness <- lkup[, unql(LSOA.DEST), by="OA01CD"]
stopifnot(all(uniqueness$V1 == 1))
# Are all LSOA11CD uniquely assigned to 1 destination
uniqueness <- lkup[, length(unique(LSOA.DEST)), by="LSOA11CD"]
stopifnot(all(uniqueness$V1 == 1))

# First the conversion table for all 2001 OAs to a single 2011 LSOA
to.output <- lkup[!duplicated(OA01CD),c("OA01CD", "OA01CDO", "LSOA01CD", "LSOA.DEST"), with=FALSE]
setkey(to.output, "OA01CD")
setnames(to.output, "LSOA.DEST", "LSOA11CD")
write.csv(to.output, "working/convert-oa01_to_lsoa11.csv", row.names=FALSE)

# Then the cross-matching and quality indicator within 2011 LSOAs
to.output <- lkup[!duplicated(LSOA11CD),
                  c("LSOA11CD", "LSOA11NM", "LSOA.DEST", "MSOA11CD", "GeogChange"),
                  with=FALSE]

# 4 missing cases, added back in manually with matches from visual inspection
missing <- nin(lsoa11.codes, to.output$LSOA11CD)
unq(missing)
# These manual assignments
# E01033618: Birmingham 137C → 137E
# E01032701: Swindon 027A → 027B
# E01032704: Swindon 027C → 027B
# E01033174: Pboro 022C → 022D; set 022D as "C" class
to.output <- rbind(data.frame(to.output),
                   c("E01033618","Birmingham 137C","E01033565","E02006898","Cx"),
                   c("E01032701","Swindon 027A","E01032702","E02006848","Cx"),
                   c("E01032704","Swindon 027C","E01032702","E02006848","Cx"),
                   c("E01033174","Peterborough 022C","E01033179","E02006877","Cx") )
to.output[to.output$LSOA11NM == "Peterborough 022D", "GeogChange"] <- "C"

to.output <- to.output[order(to.output$LSOA11CD),]
write.csv(to.output, "working/lsoa11-matches.csv", row.names=FALSE)





