# UMBR14
# Part 05: Household estimation
#
# This section estimates the household population in each small area. It
# takes the annual official population estimates by sex and age, deducts
# the count of communal establishment residents (interpolated between
# Censuses), and multiplies the remaining household population by the
# age-sex specific headship rate in that area.
# 
source("umbr_functions.r")

# LOAD THE COMPILED POPULATION ESTIMATES 
pop.ests <- rbindlist( list( fread("working/popests-lsoa-ew-2001-12.csv"),
                                fread("working/popests-scot-2002_12.csv") ) )
setkeyv(pop.ests, c("Geogcode", "Year"))
setnames(pop.ests, "Count", "Population")


# INSTITUTIONAL POPULATIONS
inst.pops <- rbindlist( list( fread("working/institutional_pops-ew.csv"),
                             fread("working/institutional_pops-sc.csv") ))

# Interpolate intervening years of institutional populations
ipop <- inst.pops[,as.list(seq(from=as.double(Count.01), to=as.double(Count.11), length.out=11)),
                   by=list(Geogcode, Sex.Age)]
setnames(ipop, 3:13, as.character(2001:2011))
ipop[,c("2012") := ipop[,"2011", with=FALSE]] # hold constant for 2012
ipop.l <- melt(ipop, id.vars=c("Geogcode", "Sex.Age"), variable.name="Year")
ipop.l[,Year := as.integer(as.character(Year))]
setnames(ipop.l, "value", "Inst.Pop")

# CALCULATE HEADSHIP RATES
headship  <- rbindlist( list( fread("working/headship_ew.csv"),
                              fread("working/headship_scot.csv") ) )

# National average rates by sex/age
headship.avg <- headship[,
                         list(avg.01=sum(Count.HRP.01)/sum(Count.HHPOP.01),
                              avg.11=sum(Count.HRP.11)/sum(Count.HHPOP.11) ),
                          by=list(Sex.Age) ]
setnames(headship.avg, "Sex.Age", "Sex.Age.Group")
setkey(headship.avg, "Sex.Age.Group")

# Per-area rates
headship[,hrp.rate.01 := Count.HRP.01 / Count.HHPOP.01]
headship[,hrp.rate.11 := Count.HRP.11 / Count.HHPOP.11]

# Because of changing populations & rounding-off errors from summing
# 2001 OAs, It's possible to have LSOAs with nil population in an age
# group, or a HRP population greater than the hhold population. There
# are only a few, and because of the small populations in the age
# groups, they have little influence on the results
#
# Cap their headship rate at 1
headship[hrp.rate.01 > 1, hrp.rate.01 := 1]
headship[hrp.rate.11 > 1, hrp.rate.11 := 1]

# Set missing values to country average
headship[is.na(hrp.rate.01),
         hrp.rate.01 := headship.avg[Sex.Age,"avg.01", with=FALSE]]
headship[is.na(hrp.rate.11),
        hrp.rate.11 :=  headship.avg[Sex.Age,"avg.11", with=FALSE]]

# Interpolate headship rates for inter-censal years 
hrp.all <- headship[,
                  as.list(seq(from=hrp.rate.01, to=hrp.rate.11, length.out=11)),
                  by=list(Geogcode, Sex.Age)]
setnames(hrp.all, 3:13, as.character(2001:2011))
hrp.all[,c("2012") := hrp.all[,"2011", with=FALSE]]
hrp.l <- melt(hrp.all, id.vars=c("Geogcode", "Sex.Age"), variable.name="Year")
hrp.l[,Year := as.integer(as.character(Year))]
setnames(hrp.l, "value", "HRP.Rate")

# THROW THE THREE DATASETS TOGETHER
hhold.est <- merge(pop.ests, ipop.l,
                    by=c("Geogcode", "Year", "Sex.Age") )
hhold.est <- merge(hhold.est, hrp.l, 
                    by=c("Geogcode", "Year", "Sex.Age") )

# THE CALCULATION ITSELF: 
hhold.est[,Hhold.Pop := Population-Inst.Pop]
# A small number of areas, years have an institutional population
# greater than the whole population estimate for corresponding
# year. This applies mainly to early years in areas which have had to be
# combined due to 2011 boundary changes. Set the household population
# for these to nil.
hhold.est[Hhold.Pop<0, Hhold.Pop := 0]
hhold.est[,Hholds := Hhold.Pop * HRP.Rate]

# Aggregation to total households / year
hhold.tots <- hhold.est[,
                        list(Hholds=sum(Hholds)),
                        by=list(Geogcode,Year)]


# PROJECT FORWARD HOUSEHOLD GROWTH RATES TO 2013
# Create a wide dataset with years as columns ("X2001" etc)
hhold.w <- data.frame(dcast(hhold.est, Geogcode~Year, sum, value.var="Hholds"))
# Calculate an annualised growth from such a dataset
ann.growth.rate <- function(df, end.year, span) {
    growth <- df[,sprintf("X%i", end.year)] / hhold.w[,sprintf("X%i", end.year - span)]
    # avoid imaginary parts of cube &c roots of negative numbers
    growth.ann <- sign(growth) * abs(growth) ^ (1/span) 
}
# Can be empirically tested, e.g. here, growth 2011-12 with annualised 2008-2011
# cor(ann.growth.rate(hhold.w, 2011, 1), ann.growth.rate(hhold.w, 2010, 3))

# We'll use the 4-year annualised growth rate
hhold.w$growth.4yr.ann <- ann.growth.rate(hhold.w, 2012, 4)

# Add the projected household estimates for 2013 in
hhold.tots <- rbindlist(list(hhold.tots,
                             data.table(Geogcode=hhold.w$Geogcode,
                                        Year=2013,
                                        Hholds=hhold.w$X2012 * hhold.w$growth.4yr.ann) ) )


# CONSTRAIN TO PUBLISHED ESTIMATES / PROJECTIONS
# Hand-compiled estimates for England, Wales & Scotland by LA district
la.hh.ests <- rbindlist( list( fread("data/hhold_ests-la-compiled-england.csv", header=TRUE),
                               fread("data/hhold_ests-la-compiled-wales.csv", header=TRUE),
                               fread("data/hhold_ests-la-compiled-scotland.csv", header=TRUE) ) )
la.hh.ests <- melt(la.hh.ests, id.vars=c("LAD11CD", "LAD11NM"), value.name="LA.Hholds")
la.hh.ests[,Year:= as.integer(as.character(variable))]

# LSOA/Datazone to LA lookup
lsoa.to.la <- fread("data/OA11_LSOA11_MSOA11_LAD11_EW_LUv2.csv")
lsoa.to.la <- lsoa.to.la[ ! duplicated(LSOA11CD),
                         c("LSOA11CD", "LAD11CD"), with=FALSE]
setnames(lsoa.to.la, 1:2, c("Geogcode", "LAD11CD") )
dz.to.la <- fread("~/data/BORDERS/lookup/scotoa11_to_higher")
dz.to.la <- dz.to.la[ ! duplicated(Datazone2001Code),
                     c("Datazone2001Code", "CouncilArea2011Code"), with=FALSE]
setnames(dz.to.la, 1:2, c("Geogcode", "LAD11CD") )
geog.to.la <- rbindlist( list(lsoa.to.la, dz.to.la) )

hhold.tots <- merge(hhold.tots, geog.to.la, by="Geogcode")
hhold.tots <- merge(hhold.tots, la.hh.ests, by=c("LAD11CD", "Year") )
# Sum of unconstrained estimates per-LA per-year
hhold.tots[, LA.Est.HH := sum(Hholds), by=c("LAD11CD", "Year") ]
# Constrain to official LA totals
hhold.tots[, Hholds.Constrained := Hholds / LA.Est.HH * LA.Hholds]

hhold.final <- hhold.tots[, list(Geogcode=Geogcode,
                                 Year=Year,
                                 Hholds=round(Hholds.Constrained) )]
setkeyv(hhold.final, c("Geogcode", "Year") )

# SAVE THE ESTIMATES
write.csv(hhold.final, "working/household_estimates.csv",
          row.names=FALSE )


# CHECKING - none of this needs to be run
# Some checks on the estimated households - comparison to Census enumeration
hholds.check <- headship[,list(Hholds.01=sum(Count.HRP.01),
                               Hholds.11=sum(Count.HRP.11) ),
                         by=list(Geogcode)]
setkeyv(hholds.check, c("Geogcode"))

hholds.check[,country := substring(Geogcode, 1,1)]
hhold.final[,country := substring(Geogcode, 1,1)]

# Cross-checks against Censal numbers
# We expect the estimates to be higher because they are mid-year
hhold.final[Year==2001, sum(Hholds), by="country"]
hholds.check[,sum(Hholds.01), by="country"]

# Variance by country < 0.2%
hhold.final[Year==2001, sum(Hholds), by="country"]$V1 / hholds.check[,sum(Hholds.01), by="country"]$V1
hhold.final[Year==2011, sum(Hholds), by="country"]$V1 / hholds.check[,sum(Hholds.11), by="country"]$V1

