library(data.table)

# Should convert functions using these to use data.table instead
library(plyr)
library(reshape2)

# Shortcut for unique
unq  <- function(elems) unique(elems)
# Number of unique elements in list
unql <- function(elems) length(unique(elems))
# not-in
nin  <- function(a, b) a[ !(a %in% b)]


# Function to sum columns according to a specification (e.g. producing
# new broader age bands out of single-year or five-year age bands in
# Census or small-area pop data. Returns a data.frame with the grouped
# columns as col.names and the row.names from a specific geography
# column.
#
# + Age bands is a list with the names of the bands as values, and the
# corresponding columns that should be summed as the values
# + geog.col is the name of a column identifying areas
sum.regroup <- function(df, age.bands, geog.col) {
    groupd <- t( ldply( names(age.bands),
                       function(b) rowSums(df[, age.bands[[b]]]) ) )
    rownames(groupd) <- df[,geog.col]
    colnames(groupd) <- names(age.bands)
    group.l <- melt(groupd)
    colnames(group.l) <- c("Geogcode", "Sex.Age", "Count")
    group.l
}

# As above, but for data.table
sum.regroup.dt <- function(dt, age.bands, geog.col) {
    groupd <- dt[, lapply(age.bands,
                          function(ab) rowSums(dt[,ab,with=FALSE]))]
    groupd$Geogcode <- dt[,geog.col,with=FALSE]
    group.l <- melt(groupd, id.vars="Geogcode")
    setnames(group.l, 1:3, c("Geogcode", "Sex.Age", "Count"))
    group.l
}

# Function to compile many years acccording to a specification, and provide flattened data
# 
# + Read function is a function that will be passed the year, it should
# return the raw data
#
regroup.and.merge <- function(year.range, age.bands, geog.col, read.func) {
    yr.list <- lapply(year.range, function(yr) {
        raw <- read.func(yr)
        gpd.long <- sum.regroup(raw, age.bands, geog.col) 
        gpd.long$Year <- yr
        gpd.long
    })
    do.call(rbind, yr.list)
}

# CENSUS TABLE definitions
# Column names for CAS001
CAS001.AGES <- c("ALL", "0_4", 0:4, "5_9", 5:9,
                 "10_14", 10:14, "15_19", 15:19, "20_24", 20:24,
                 sprintf("%i_%i", seq(25,85,5), seq(29,89,5)),
                 "90_plus")
# Sex and resident type (columns)
CAS001.TYPES <- c("All", "M_Hhold", "F_Hhold", "M_Inst", "F_Inst")
# Column names
CAS001.COLS <- sprintf("%s.%s", rep(CAS001.TYPES, length(CAS001.AGES)),
                       sapply(CAS001.AGES, rep, length(CAS001.TYPES)) )

# The target banding 
CAS001.AGE.BANDS.EW <- list("0_15" = c("0_4", "5_9", "10_14", "15"),
                            "16_24" = c(16:19, "20_24"),
                            "25_34" = c("25_29", "30_34"),
                            "35_49" = c("35_39", "40_44", "45_49"),
                            "50_64" = c("50_54", "55_59", "60_64"),
                            "65_plus" = c("65_69", "70_74", "75_79", "80_84", "85_89", "90_plus") )

CAS001.AGE.BANDS.SC <- list("0_15" = c("0_4", "5_9", "10_14", "15"),
                            "16_24" = c(16:19, "20_24"),
                            "25_49" = c("25_29", "30_34", "35_39", "40_44", "45_49"),
                            "50_64" = c("50_54", "55_59", "60_64"),
                            "65_plus" = c("65_69", "70_74", "75_79", "80_84", "85_89", "90_plus") )

# Column names for CAS003 - sex by age by marital status - household reference persons
# Age categories (rows)
CAS003.AGES <- c("ALL", "19_under",
                 sprintf("%i_%i", seq(20,85,5), seq(24,89,5)),
                 "90_plus")

# HRP and marital status counts (columns)
CAS003.TYPES <- c("All_Tot", "All_Marr", "All_Sgl",
                  "M_Tot", "M_Marr", "M_Sgl",
                  "F_Tot", "F_Marr", "F_Sgl")
CAS003.COLS <- sprintf("%s.%s", rep(CAS003.TYPES, length(CAS003.AGES)),
                       sapply(CAS003.AGES, rep, length(CAS003.TYPES)) )

CAS003.AGE.BANDS.EW <- list("16_24" = c("19_under", "20_24"),
                         "25_34" = c("25_29", "30_34"),
                         "35_49" = c("35_39", "40_44", "45_49"),
                         "50_64" = c("50_54", "55_59", "60_64"),
                         "65_plus" = c("65_69", "70_74", "75_79", "80_84", "85_89", "90_plus") )

CAS003.AGE.BANDS.SC <- list("16_24" = c("19_under", "20_24"),
                            "25_49" = c("25_29", "30_34", "35_39", "40_44", "45_49"),
                            "50_64" = c("50_54", "55_59", "60_64"),
                            "65_plus" = c("65_69", "70_74", "75_79", "80_84", "85_89", "90_plus") )

# LC1105.EW - Sex and age by residence type (whether in communal establishment)
LC1105.EW.AGE.BANDS <- list("0_15" = c("0_4", "5_7", "8_9", "10_14", "15"),
                            "16_24" = c("16_17", "18_19", "20_24"),
                            "25_34" = c("25_29", "30_34"),
                            "35_49" = c("35_39", "40_44", "45_49"),
                            "50_64" = c("50_54", "55_59", "60_64"),
                            "65_plus" = c("65_69", "70_74", "75_79", "80_84", "85_plus") )
LC1105.EW.AGES <- c("All", unlist(LC1105.EW.AGE.BANDS))
LC1105.EW.RES.TYPES <- c("All", "Hhold", "Inst")
LC1105.EW.SEXES <- c("All", "M", "F")

LC1105.EW.COLS <- sprintf( "%s.%s.%s",
                       rep(LC1105.EW.SEXES, length(LC1105.EW.AGES) * length(LC1105.EW.RES.TYPES) ),
                       sapply(LC1105.EW.AGES, rep, length(LC1105.EW.SEXES) * length(LC1105.EW.RES.TYPES) ),
                       rep(sapply(LC1105.EW.RES.TYPES, rep, length(LC1105.EW.SEXES)), length(LC1105.EW.AGES) ) )

# LC1104.SC - The same information (including sex) is in the LC1104SC in
# Scotland, with slightly different age bands in source and in target
LC1104.SC.AGE.BANDS <- list("0_15" = c("0_4", "5_7", "8_9", "10_14", "15"),
                            "16_24" = c("16_17", "18_19", "20_24"),
                            "25_49" = c("25_29", "30_34", "35_39", "40_44", "45_49"),
                            "50_64" = c("50_54", "55_59", "60_64"),
                            "65_plus" = c("65_69", "70_74", "75_79", "80_84", "85_89", "90_94", "95_plus") )

LC1104.SC.AGES <- c("All", unlist(LC1104.SC.AGE.BANDS))

LC1104.SC.COLS <- sprintf("%s.%s.%s",
                          rep(LC1105.EW.SEXES, length(LC1104.SC.AGES) * length(LC1105.EW.RES.TYPES) ),
                          sapply(LC1104.SC.AGES, rep, length(LC1105.EW.SEXES) * length(LC1105.EW.RES.TYPES) ),
                          rep(sapply(LC1105.EW.RES.TYPES, rep, length(LC1105.EW.SEXES)), length(LC1104.SC.AGES) ) )

# LC1101.EW - Marital status by sex and age, household reference persons
LC1101.EW.AGE.BANDS <- list("16_24" = "24_under",
                            "25_34" = "25_34",
                            "35_49" = "35_49",
                            "50_64" = "50_64",
                            "65_plus" = "65_plus")
LC1101.EW.AGES <- c("All", unlist(LC1101.EW.AGE.BANDS) )
LC1101.EW.SEXES <- c("All", "M", "F")
LC1101.EW.MARITALS <- c("All", "Single", "Married", "CivPart", "Separated", "Divorced", "Widowed")

LC1101.EW.COLS <- sprintf("%s.%s.%s",
                          sapply(LC1101.EW.SEXES, rep, length(LC1101.EW.AGES) * length(LC1101.EW.MARITALS) ),
                          rep(sapply(LC1101.EW.AGES, rep, length(LC1101.EW.MARITALS)), length(LC1101.EW.SEXES) ),
                          rep(LC1101.EW.MARITALS, length(LC1101.EW.AGES) * length(LC1101.EW.SEXES) ) )

LC1101.SC.AGES <- list("Total" = "All",
                       "24 and under" = "16_24",
                       "25 to 49" = "25_49",
                       "50 to 64" = "50_64",
                       "65 and over" =  "65_plus")

LC1101.SC.SEXES <- list("All HRPs:"    = "All",
                        "Male HRPs:"   = "M",
                        "Female HRPs:" = "F")
