# UMBR14
# Part 02: Group & compiling population estimates
# 
# This part compiles the published population estimates from separate
# year / sex files and builds them into a single dataset (for now,
# separately for England & Wales and for Scotland).
#
# The published age groups (annual in E&W, 5-yearly in Sc) are compiled
# to age/sex groups that will be used in the household estimation
# method, which are dictated by the categories in the published Census
# tables for institional populations and age/sex of household reference
# persons.
#
# In England and Wales, the data for published 2011 LSOAs is re-summed
# where necessary to the final output LSOAs that will be used, where
# multiple LSOAs data will be represented by a single LSOA in the final
# output dataset

# data.table is MUCH faster at aggregating data
source("umbr_functions.r")

# # ENGLAND AND WALES # # 
# LSOA Population Estimates
# The age bands used are determined by those categories used in the 2011 Census Tables
age.cats.ew <- list("01_15" =  0:15, "16_24" = 16:24, "25_34" = 25:34,
                    "35_49" = 35:49, "50_64" = 50:64, "65_plus" = 65:89)

age.cats.m <- lapply(age.cats.ew, function(range) paste("m", range, sep="") )

age.cats.m[["65_plus"]] <- c(age.cats.m[["65_plus"]], "m90plus")
names(age.cats.m) <- paste("M", names(age.cats.m), sep=".")

age.cats.f <- lapply(age.cats.ew, function(range) paste("f", range, sep="") )
age.cats.f[["65_plus"]] <- c(age.cats.f[["65_plus"]], "f90plus")
names(age.cats.f) <- paste("F", names(age.cats.f), sep=".")

# Males - Population Estimates
males <- regroup.and.merge(2001:2011, age.cats.m, "LSOA11CD",
                           function(yr) read.csv(sprintf("data/popests/eng_wales/males-mid%i.csv", yr) ) )
# For some unknown reason, this is formatted with different table headers, has to be dealt with separately
m.2012 <- read.csv("data/popests/eng_wales/males-mid2012.csv")
colnames(m.2012)[4:94] <- c( paste("m", 0:89, sep=""), "m90plus")
m.2012.l <- sum.regroup(m.2012, age.cats.m, "Code")
m.2012.l$Year <- 2012
males <- rbind(males, m.2012.l)


females <- regroup.and.merge(2001:2011, age.cats.f, "LSOA11CD",
                         function(yr) read.csv(sprintf("data/popests/eng_wales/females-mid%i.csv", yr) ) )
# Ditto, the female 2012 is inconsistently labelled
f.2012 <- read.csv("data/popests/eng_wales/females-mid2012.csv")
colnames(f.2012)[4:94] <- c( paste("f", 0:89, sep=""), "f90plus")
f.2012.l <- sum.regroup(f.2012, age.cats.f, "Code")
f.2012.l$Year <- 2012
females <- rbind(females, f.2012.l)

# Finally, re-sum to aggregated LSOAs
lsoa2lsoa <- fread("working/lsoa11-matches.csv")
m.f <- rbind(data.table(males), data.table(females))

# Using data.table (This takes a LOOONG time using basic aggregate or plyr)
all.pop <- m.f[,list(Count=sum(Count)),
                  by=list(Geogcode=lsoa2lsoa[match(m.f$Geogcode, lsoa2lsoa$LSOA11CD),LSOA.DEST],
                      Year=Year,
                      Sex.Age=Sex.Age)
               ]

write.csv(all.pop, "working/popests-lsoa-ew-2001-12.csv", row.names=FALSE)
head(all.pop)

### SCOTLAND
SCOT_POPEST_BANDS <- list("0_15" = c("0_4", "5_9", "10_14"),
                          "16_24" = c("15_19", "20_24"),
                          "25_49" = c("25_29", "30_34", "35_39", "40_44", "45_49"),
                          "50_64" = c("50_54", "55_59", "60_64"),
                          "65_plus" = c("65_69", "70_74", "75_79", "80_84", "85_89", "90_plus") )


scot.pope.base.dir <- "data/popests/scot/"
age.grps.sc <- c("All.Ages", paste( seq(0,85,5), seq(4,89,5), sep="_"), "90_plus")

# The scottish population estimate files are very messy and badly
# formatted, they have to be read carefully and tidied.
# year - year to be read, sex - sex to be read
# junk.cols - columns to be blindly deleted (different in 2001)
# junk.rows - rows to be ignored
# nrows - then number of rows to be read
read.scot <- function(year, sex,
                      junk.cols,
                      junk.rows,
                      total.cols=27,
                      not.junk.rows=6505 ) {
    popf <- Sys.glob(sprintf("%s*%i*%s*.csv", scot.pope.base.dir, year, sex))[1]
    pop.est <- fread(popf,
                     skip=junk.rows, header=FALSE, nrows=not.junk.rows,
                     colClasses=rep("character", total.cols) )
    # Delete junk columns
    pop.est[,junk.cols] <- NULL
    pop.est <- clean.scot(pop.est)
    age.grps <- paste(toupper(sex), age.grps.sc, sep=".")
    setnames(pop.est, 2:21, age.grps)
    pop.est$Year <- year
    pop.est
}

# Cleans and converts the strings to integers     
clean.scot <- function(pop.est) {
    pop.est.df <- data.frame(pop.est)
    data.table(Datazone=pop.est.df[,1],
               apply(pop.est.df[,2:21], 2,
                     function(c) as.integer(gsub(",", "", c) ) ) )
}

# Male and female column titles
age.cats.m <- lapply(SCOT_POPEST_BANDS, function(range) paste("M", range, sep=".") )
names(age.cats.m) <- paste("M", names(age.cats.m), sep=".")

age.cats.f <- lapply(SCOT_POPEST_BANDS, function(range) paste("F", range, sep=".") )
names(age.cats.f) <- paste("F", names(age.cats.f), sep=".")

junk.cols.01 <- c(3,23:24)
# 2001 estimates are not formatted consistently, have to be done separately
# I know the year looks funny - but the file has a strange name
m.2001 <- read.scot(104, "m", junk.cols=junk.cols.01, junk.rows=5, total.cols=24)
m.2001$Year <- 2001
f.2001 <- read.scot(104, "f", junk.cols=junk.cols.01, junk.rows=5, total.cols=24)
f.2001$Year <- 2001

junk.cols.0212 <- c(3,13,14,15,26,27)
# Population Estimates for men and women
# The 0 is just a horrible place holder for m.2001
male.popes <- c(0, lapply(2002:2012, read.scot, "m", junk.cols.0212, 6) )
male.popes[[1]] <- m.2001
male.popes <- lapply(male.popes, sum.regroup.dt, age.cats.m, "Datazone")
# Add in the year flag
mapply( function(dt, yr) dt[,Year := yr], male.popes, 2001:2012)

female.popes <- c(0, lapply(2002:2012, read.scot, "f", junk.cols.0212, 6) )
female.popes[[1]] <- f.2001
female.popes <- lapply(female.popes, sum.regroup.dt, age.cats.f, "Datazone")
mapply( function(dt, yr) dt[,Year := yr], female.popes, 2001:2012)

all.popes <- rbindlist(list(rbindlist(male.popes),
                            rbindlist(female.popes)))

# Note that the Scottish population estimates look somewhat odd in
# 2001-2003 - very little increase - even though 2002 and 2003 are
# definitely from the same series of revised forecasts

# Order so consistent with England & Wales
setcolorder(all.popes, c("Geogcode", "Year", "Sex.Age", "Count") )
write.csv(all.popes, "working/popests-scot-2002_12.csv", row.names=FALSE)

