# UMBR2014
# Part 06: Area reweighting of means-tested benefits
#
# This section calculates the weights that will be used to apportion
# 2001 LSOA DWP benefit counts to their constituent 2001 OAs, so that
# they can then be summed again to 2011 LSOA boundaries
#
# The weighting is derived from two sources:
#
# * a 2001 Census count of adults aged 16-74 who are either unemployed,
#   permanently sick or disabled, or "looking after family" AND not in a
#   couple
# 
# * Experimental statistics currently published for one year (2013)
#   only, with rounded counts of workless benefits for 2001 output areas. 
#
#
# No special conversions are required for Scotland.
# 
# The file uses:
# - the OA 2001 to LSOA 2011 lookups from 1)
# - Census table XXX
# - Means-tested benefit counts for JSA, IS, ESA, PC(GC) (2001-2013)
# - Experimental Universal Credit statistics (2013)
# - Experimental OA-level benefit statistics (2012)
source("umbr_functions.r")

# Load UMBR-2012 edition, for easy access to benefits data
umbr <- fread("../../spatial_analysis/UMBR/umbr_hh-esw-2001-11.csv")


# Selection from 2001 Census table CAS030: econ activity by living arrangements
e.act <- read.csv("econ_act-lsoa-ew-2001-alt.csv", skip=6)

e.act$geogcode <- substr(as.character(e.act[,1]), 1, 9)
# All unemployed adults 16-74
e.act$unemp <- e.act[,2]
# Plus all "permanently sick or disabled"
e.act$unemp.sick <- e.act[,2] + e.act[,5]
# Plus "Looking after home/family" AND "Not living any couple"
e.act$unemp.sick.lpfam <- e.act[,2] + e.act[,4] + e.act[,5]
# Plus all economically inactive students
e.act$unemp.sick.lpfam.stu <- rowSums(e.act[,2:5], na.rm=TRUE )

# Merge with 2001 UMBR 
test <- merge(e.act, umbr[year==2001,], by="geogcode")

# Strength of correlations between UMBR Benefits & 2001 Census indicators
cor(test$unemp, test$All.MTB) # Only unemploment → 0.84
cor(test$unemp.sick, test$All.MTB) # With sick, better → 0.88
cor(test$unemp.sick.lpfam, test$All.MTB) # With single family/carer, the best → .91
cor(test$unemp.sick.lpfam.stu, test$All.MTB) # Much worse with students → 0.78

# Also tested 
# This is 2001 Census table CAS013: Economic activity of HRP
# This is less closely correlated with UMBR Benefits.
# e.act.hrp <- fread("econ_act-lsoa-ew-2001.csv",)

# EXPERIMENTAL OA-LEVEL WORKLESSNESS DATA FROM 2013
#
# Load all the pre-region, per-quarter data on output-area level
# benefits. Note that using calendar year averages across four quarters
# slightly improves accuracy of estimation
oa.bens.tables.13 <- lapply(Sys.glob("data/oa_benefits_2013/OA_Workless_*_*13_Table.csv"), function(csv.file) {
    print(csv.file)
    oa.wk <- fread(csv.file, skip=4, sep=";", header=FALSE)
    oa.wk[,1:6 := NULL] # Delete un-needed columsn
    setnames(oa.wk, 1:7,
             c("geogcode", "OA01CDO", "JSA", "ESA", "LPar", "OTH", "All.Bens"))
    oa.wk
})
# Merge
all.oa.bens <- rbindlist(oa.bens.tables.13)
# 4 quarter average
noworkbens.oa <- all.oa.bens[,list(All.Bens=mean(All.Bens)),
                          by=c("OA01CDO", "geogcode")]
noworkbens.lsoa <- noworkbens.oa[,list(All.Bens=sum(All.Bens)),
                             by=c("geogcode") ]
# Tidy up a bit
rm(oa.bens.tables.13, all.oa.bens)

test <- merge(noworkbens.lsoa, umbr[year == 2001,], by="geogcode")
# Basic correlation with 2001 = 0.92
cor(test$All.Bens, test$All.MTB)

# Make a dataset with both possible sets of indicators
indicators <- merge(e.act, noworkbens.lsoa, by="geogcode")

# UMBR - England & Wales only
umbr.ew <- umbr[geogcode %in% indicators$geogcode,]

# Cycle through the years testing correlations of various possible indicators with UMBR All.MTB
res <- ldply(2001:2011, function(yr) {
    # Various combinations with weighted / unweighted indicators
    unwtd.indic <- indicators$All.Bens + indicators$unemp.sick.lpfam
    wtd.indic.01 <- indicators$All.Bens + ( indicators$unemp.sick.lpfam * ( max(0, 2011 - yr) ) / 10 )
    wtd.indic.02 <- indicators$All.Bens + ( indicators$unemp.sick.lpfam * ( max(0, 2012 - yr) ) / 11 )
    wtd.indic.03 <- indicators$All.Bens + ( indicators$unemp.sick.lpfam * ( max(0, 2013 - yr) ) / 12 )
    data.frame( year=yr,
               bens.only=cor(indicators$All.Bens, umbr.ew[year == yr, All.MTB]),
               cens.only=cor(indicators$unemp.sick.lpfam, umbr.ew[year == yr, All.MTB]),
               unwtd=cor(unwtd.indic, umbr.ew[year == yr, All.MTB]),
               wtd.01=cor(wtd.indic.01, umbr.ew[year == yr, All.MTB]),
               wtd.02=cor(wtd.indic.02, umbr.ew[year == yr, All.MTB]),
               wtd.03=cor(wtd.indic.03, umbr.ew[year == yr, All.MTB]) )
   }
)

# RESULTS
# 1) Combined indicators give better results than either single indicator
# 2) Time-weighting / fading-out the census indicator improves later years
# 3) The best fit among these is #3 
# 4) Time-weighting the OA benefits doesn't improve cor in the early years
# 5) Simple standardising/scaling of the indicators doesn't improve cor 
res
summary(res)
old.res <- res

# SO, TO USE THIS INDICATOR
# OA-level Census data
cas030.oa <- fread("~/data/Census/cas030_econact-oa-selected_indics.csv", skip=7, header=FALSE)
cas030.oa[,2:=NULL]
setnames(cas030.oa, 1:4, c("OA01CDO", "unemp", "lpfam", "sick") )
cas030.oa[,unemp.sick.lpfam := unemp + lpfam + sick]

# Merge the existing benefits data
oa.indic <- merge(cas030.oa, noworkbens.oa, by="OA01CDO")
head(oa.indic)

# Calculate the indicator for each OA for each year
per.year <- lapply(2001:2013, function(yr) {
    data.table(year      = yr,
               OA01CDO   = oa.indic$OA01CDO,
               LSOA01CD  = oa.indic$geogcode,
               mtb.indic = ( oa.indic$All.Bens +
                            oa.indic$unemp.sick.lpfam * max(0, 2013 - yr) / 12 ) )
} )
all.year.indic <- rbindlist(per.year)


# LSOA 2001 totals
all.year.indic.lsoa <- all.year.indic[, list(lsoa.indic=sum(mtb.indic)),
                                        by=c("LSOA01CD", "year") ]
# Calculate OA 2001 shares of per-year LSOA totals
final.indic <- merge(all.year.indic, all.year.indic.lsoa, by=c("LSOA01CD", "year") )
final.indic[, mtb.share := mtb.indic / lsoa.indic]
# One whole LSOA has no claimants at all, fix division by 0
final.indic[is.na(mtb.share),mtb.share := 0]
head(final.indic)
# Save it 
write.csv(final.indic, "working/lsoa_to_oa-benefits_apportionment.csv",
          row.names=FALSE)

