##This code imports LFS data from SPSS files and finds the employment rate and median salaries for graudates
#based on various characteristics
##
library("foreign")
library("plyr")
library("reshape2")
library("rlang")
library("spatstat")
library("dplyr")
library("lazyeval")
library("survey")
library("tidyr")

#######################################################
#           Functions used within the code            #
#######################################################

######################################################################################
#Add in the function to change SPSS data to R data
######################################################################################
#For the fifth edition (June 2017), the 2016 income and person weights were replaced with 2017 weights 
#(variables piwt17 and pwt17 from 2012 onwards

readLFS<-function(quarter, year){
  #Latest data
  test<-read.spss(paste0("LFS datasets/",year," Q",quarter,".sav"),to.data.frame=TRUE)
  
  #Select variables needed
  
  colnames<-c("COUNTRY","SEX","AGE", names(test[grep("PWT", names(test))]), 
              names(test[grep("PIWT", names(test))]),
              names(test[grep("HIQUAL", names(test))]),"HIGHO",
              names(test[grep("DEGREE", names(test))]),"SC2KMMJ",
              "SC10MMJ","ETH11EW",
              "SNGDEGB","SNGHD","FDSNGDEG","ILODEFR","DISEA","DEGCLS7",
              "FTPTWK","GRSSWK", "INDE07M","UALA","CASENO","GOVTOF2")
  
  #not all variables are in each dataset
  available_columns<-colnames[which(colnames %in% names(test))]
  
  test2<- test[,available_columns]
  
  
  
  #restrict to population in England aged 16-64
  test2<-test2[which(test2$COUNTRY=="England" & test2$AGE>15 & test2$AGE<65),]
  
  saveRDS(object = test2, file = paste0("LFS datasets/Q",quarter,"_",year,".rds"))
  
}


read_LFS_from_project<-function(quarter, year){
  
  temp <-readRDS(paste0("LFS datasets/Q",quarter,"_", year,".rds"))
  
  assign(paste0("Q",quarter,"_",year),temp,envir = globalenv())
}
######################################################################################
#Add in the function to recode variables in the LFS to those used in the publication
######################################################################################
recode_variables<- function(quarter, year){
  
  #fetch the dataset from the global environment: for example Q1_2016
  dataset<-get(paste0("Q",quarter,"_",year))
  
  
  #which HIQUAL variable is in use?
  #use 'highqual15' for LFS beyond 2014
  #For pre 2014 use HIQUAL11<- 2012-2014, HIQUAL5 2006-2011
  hiqual_var<-names(dataset[grep("HIQUAL", names(dataset))])[1]
  if(class(dataset[,c(hiqual_var)])=="numeric"){
    #add in error warning if not a factor/string variable
    paste0("Warning: dataset Q",quarter," ",year,"is missing factor levels in HIQUAL")
  }
  
  # Check if data is the right type, if not convert to a factor with the correct value levels.
  if("HIQUAL15" %in% names(dataset)& class(dataset$HIQUAL15)=="numeric"){ 
    dataset$HIQUAL15<- as.factor(dataset$HIQUAL15)
    levels(dataset$HIQUAL15)<-high_qual15
  } 
  
  ##Define Graduate Type
  #Define graduates, postgraduates and non-graduates 
  dataset$Graduate_Type<-NA  
  
  
  #Graduates are those with batchelor degrees only
  #For 2006 Degree type is set by DEGREE4, not DEGREE7
  if("DEGREE71" %in% names(dataset)){
    dataset$Graduate_Type[(dataset[,c(hiqual_var)]=="First degree/foundation degree"|
                             dataset[,c(hiqual_var)]=="First/Foundation degree"
    ) & (
      dataset$DEGREE71=="first degree" |
        dataset$DEGREE72=="first degree" |
        dataset$DEGREE73=="first degree" |
        dataset$DEGREE74=="first degree" |
        dataset$DEGREE75=="first degree" )]<-"Graduate"}
  
  if("DEGREE4" %in% names(dataset)){
    dataset$Graduate_Type[(dataset[,c(hiqual_var)]=="First degree/foundation degree"|
                             dataset[,c(hiqual_var)]=="First/Foundation degree"
    ) & (
      dataset$DEGREE4=="A first degree"  )]<-"Graduate"}
  
  
  
  # Define postgraduate qualification level
  #Note this includes 'Other Post Grad degree or professional qual' as in Annex A
  dataset$Graduate_Type[dataset[,c(hiqual_var)]=="Higher degree" 
                        & dataset$HIGHO != "Dont know"]<-"PostGrad"
  
  
  #Define those not in scope of this publication
  dataset$Graduate_Type[dataset[,c(hiqual_var)]=="Level 8 Diploma"|
                          dataset[,c(hiqual_var)]=="Level 8 Certificate"|
                          dataset[,c(hiqual_var)]=="Level 8 Award"|
                          dataset[,c(hiqual_var)]=="Level 7 Certificate"|
                          dataset[,c(hiqual_var)]=="Level 7 Diploma"|
                          dataset[,c(hiqual_var)]=='Level 7 Award'|
                          dataset[,c(hiqual_var)]=='Level 6 Award'|
                          dataset[,c(hiqual_var)]=='Level 6 Certificate'|
                          dataset[,c(hiqual_var)]=='Level 6 Diploma'|
                          dataset[,c(hiqual_var)]=="Don.t know"|
                          dataset[,c(hiqual_var)]=="Don't know"|
                          dataset[,c(hiqual_var)]=="Did not know"|
                          (dataset[,c(hiqual_var)]=="Higher degree" & dataset$HIGHO =="Dont know")|
                          (dataset[,c(hiqual_var)]=="Higher degree" & is.na(dataset$HIGHO))
                        ]<-"Not in scope"
  
  #Set everyone who isn't a graduate, post graduate or not in scope to a non graduate
  dataset$Graduate_Type[is.na(dataset$Graduate_Type)]<-"Non Grad"
  
  
  # AgeGroup.
  dataset$AgeGroup[dataset$AGE %in% 16:20]<-"16-20"
  dataset$AgeGroup[dataset$AGE %in% 21:30]<-"21-30"
  dataset$AgeGroup[dataset$AGE %in% 31:40]<-"31-40"
  dataset$AgeGroup[dataset$AGE %in% 41:50]<-"41-50"
  dataset$AgeGroup[dataset$AGE %in% 51:60]<-"51-60"
  dataset$AgeGroup[dataset$AGE %in% 61:64]<-"61-64"
  
  #The remaining variables are only added if they occur in the dataset
  
  if("SC10MMJ" %in% names(dataset)){
    
    #Define skilled occupation groupings
    SOCHE1<-c("1  'Managers, Directors And Senior Officials'"        
              ,"2  'Professional Occupations'"                        
              ,"3  'Associate Professional And Technical Occupations'")
    
    SOCHE2<-c("4  'Administrative And Secretarial Occupations'"      
              ,"5  'Skilled Trades Occupations'"                      
              ,"6  'Caring, Leisure And Other Service Occupations'"  )
    
    SOCHE3<-c("7  'Sales And Customer Service Occupations'"          
              ,"8  'Process, Plant And Machine Operatives'"           
              ,"9  'Elementary Occupations'"  )
    
    dataset$SOCHE[dataset$SC10MMJ %in% SOCHE1]<-"High Skilled Employment"
    dataset$SOCHE[dataset$SC10MMJ %in% SOCHE2]<-"Medium Skilled Employment"
    dataset$SOCHE[dataset$SC10MMJ %in% SOCHE3]<-"Low Skilled Employment"
    dataset$SOCHE[is.na(dataset$SOCHE)]<-"Else"
  }
  
  
  #Define skilled occupation groupings- name of variable changes in earlier datasets
  if("SC2KMMJ" %in% names(dataset)){
    SOCHE1<-c("1  'Managers and Senior Officials'"         
              ,"2  'Professional occupations'"                           
              ,"3  'Associate Professional and Technical'" )
    
    SOCHE2<-c("4  'Administrative and Secretarial'"       
              ,"5  'Skilled Trades Occupations'"                     
              ,"6  'Personal Service Occupations'"  )
    
    SOCHE3<-c("7  'Sales and Customer Service Occupations'"          
              ,"8  'Process, Plant and Machine Operatives'"           
              ,"9  'Elementary Occupations'"  )
    
    dataset$SOCHE[dataset$SC2KMMJ %in% SOCHE1]<-"High Skilled Employment"
    dataset$SOCHE[dataset$SC2KMMJ %in% SOCHE2]<-"Medium Skilled Employment"
    dataset$SOCHE[dataset$SC2KMMJ %in% SOCHE3]<-"Low Skilled Employment"
    dataset$SOCHE[is.na(dataset$SOCHE)]<-"Else"
  }
  
  
  
  #Define ethnicity groups
  if("ETH11EW" %in% names(dataset)){
    dataset$ETHNICITY<-dataset$ETH11EW
    
    dataset$ETHNICITY[dataset$ETH11EW %in% c("Mixed / Multiple ethnic groups" ,"Chinese","Arab")]<-"Other ethnic group."   
  }
  
  #Define Subject Type
  if("SNGDEGB" %in% names(dataset)){
    dataset$STEM[dataset$SNGDEGB %in% 1:9]<- 1
    #10.1~Economics, Law, Business
    dataset$LEM[dataset$SNGDEGB %in% 11:12| (substr(dataset$FDSNGDEG,1,4)=="10.1" 
                                             & dataset$Graduate_Type=="Graduate")|
                  (substr(dataset$SNGHD,1,4)=="10.1" 
                   & dataset$Graduate_Type=="PostGrad")]<-1
    #10.0~Social Studies
    #10.2:10.9~Politics, Sociology, Social Policy, Social Work,Anthropology, 
    #Human and Social Geography, Other Social Studies
    
    dataset$OSSAH[dataset$SNGDEGB %in% 13:19|  
                    (substr(dataset$FDSNGDEG,1,4) %in% c("10.0","10.2","10.3","10.4","10.5",
                                                         "10.6","10.7","10.8","10.9")
                     & dataset$Graduate_Type=="Graduate")|
                    (substr(dataset$SNGHD,1,4) %in% c("10.0","10.2","10.3","10.4","10.5",
                                                      "10.6","10.7","10.8","10.9")
                     & dataset$Graduate_Type=="PostGrad")]<-1
  }
  
  #Define Occupation.
  if("SC10MMJ" %in% names(dataset)){
    dataset$Occupation<-dataset$SC10MMJ
    dataset$Occupation<-factor(dataset$Occupation, levels=c(levels(dataset$Occupation),"10 'Medium Skilled Employment'","11 'Low Skilled Employment'"))
    dataset$Occupation[dataset$SC10MMJ %in% c("4  'Administrative And Secretarial Occupations'",      
                                              "5  'Skilled Trades Occupations'",
                                              "6  'Caring, Leisure And Other Service Occupations'")]<-"10 'Medium Skilled Employment'"
    dataset$Occupation[dataset$SC10MMJ %in% c("7  'Sales And Customer Service Occupations'" , "8  'Process, Plant And Machine Operatives'"           
                                              ,"9  'Elementary Occupations'" )]<-"11 'Low Skilled Employment'"
  }
  
  #Labels Occupation 					
  #1 'Managers, Directors and Senior Officials'
  #2 'Professional Occupations'
  #3	'Associate Professional and Technical Occupations'		
  #10	'Medium Skilled Employment'							
  #11	'Low Skilled Employment'.
  
  
  assign(paste0("Q",quarter,"_",year),dataset,envir = globalenv())
  
}

#########################################################################################
#                           Employment rate by demographic                              #
#########################################################################################

#Headline stats:

#for 16-64s
Headline_stats_function<-function(all,agebracket){
  #subset data if needed
  
  if(agebracket=="16-64"){all<- all}else{all<-all[which(all$AgeGroup=="21-30"),]}
  
  all %>%
    group_by(Graduate_Type)%>% summarise(total=sum(freq),sample=sum(sample))->full_pop
  
  types<-full_pop["Graduate_Type"]
  
  all[which(all$ILODEFR=="In employment"),] %>%
    group_by(Graduate_Type)%>% summarise(count=sum(freq),sample=sum(sample))%>%
    merge(x=types, y=., by=c("Graduate_Type"), all.x=TRUE)->in_employment
  
  all[which(all$ILODEFR=="In employment" & all$SOCHE=="High Skilled Employment"),] %>%
    group_by(Graduate_Type)%>% summarise(count=sum(freq),sample=sum(sample))%>%
    merge(x=types, y=., by=c("Graduate_Type"), all.x=TRUE)->in_HSemployment
  
  all[which(all$ILODEFR=="ILO unemployed"),] %>%
    group_by(Graduate_Type)%>% summarise(count=sum(freq),sample=sum(sample))%>%
    merge(x=types, y=., by=c("Graduate_Type"), all.x=TRUE)->unemployed
  
  all[which(all$ILODEFR=="Inactive"),] %>%
    group_by(Graduate_Type)%>% summarise(count=sum(freq),sample=sum(sample))%>%
    merge(x=types, y=., by=c("Graduate_Type"), all.x=TRUE)->inactive
  
  
  Headline_stats<-data.frame( in_employment$Graduate_Type,
                              in_employment$count/full_pop$total, 
                              in_HSemployment$count/full_pop$total,
                              unemployed$count/(unemployed$count + in_employment$count),
                              inactive$count/full_pop$total,
                              in_employment$sample,
                              in_HSemployment$sample,
                              unemployed$sample,
                              inactive$sample,
                              in_employment$count,
                              in_HSemployment$count,
                              unemployed$count,
                              inactive$count)
  
  
  Headline_stats$Age_range<-agebracket
  
  colnames(Headline_stats)<-c("Population","Employment_rate","High_skill_emp_rate","Unemployment_rate","Inactivity_rate",
                              "Employed sample","HS employed sample", "unemployed sample", "inactive sample","employed total","HS employed total",
                              "unemployed total","inactive total","Age_range")
  
  Headline_stats<-Headline_stats[c("Age_range","Population","Employment_rate","High_skill_emp_rate","Unemployment_rate","Inactivity_rate",
                                   "Employed sample","HS employed sample", "unemployed sample", "inactive sample","employed total","HS employed total",
                                   "unemployed total","inactive total")]
}


Employment_rate<-function(quarter, year){
  
  
  #get the dataset from the global environment
  dataset<-get(paste0("Q",quarter,"_",year))
  #find the weight variable
  weight<-names(dataset[grep("PWT", names(dataset))])[1]
  dataset<-dataset[which(dataset[,weight]>0),]
  # count all people by age, graduate type, ILO employment status and SOC group: weight people by the weight variable  (such as PWT16)
  #also note sample size
  all<-dataset%>%
    group_by(AgeGroup,Graduate_Type,ILODEFR,SOCHE) %>%
    summarise(freq=sum(!!sym(weight)), sample=n())
  
  
  Headline_stats<-Headline_stats_function(all,"16-64")
  
  #Headline stats for 21-30s
  Headline_stats_young<-Headline_stats_function(all,"21-30")
  
  headline_stats<-rbind(Headline_stats,Headline_stats_young)
  
  assign(paste0("Headline_Q",quarter,"_",year),headline_stats,envir = globalenv())
  
  #write.csv(headline_stats,paste0("Outputs for GLMS/Headline_Q",quarter,"_",year,".csv"))
  
}

#combine employment rates for each quarter into a yearly mean average
year_average_employment_rate<-function(year){
  
  #select dataset: for example if quarter =1 and year =2016 this will get Q1_2016 and set this to 'dataset'
  dataset1<-get(paste0("Headline_Q1_",year))
  dataset2<-get(paste0("Headline_Q2_",year))
  dataset3<-get(paste0("Headline_Q3_",year))
  dataset4<-get(paste0("Headline_Q4_",year))
  
  year_data<-rbind(dataset1,dataset2,dataset3,dataset4)
  
  year_data%>%
    group_by( Age_range, Population) %>%
    summarise(Employment_rate=mean.default(Employment_rate), 
              High_skill_emp_rate= mean.default(High_skill_emp_rate),
              Unemployment_rate=mean.default(Unemployment_rate), 
              inactivity_rate= mean.default(Inactivity_rate),
              employed_sample=sum(`Employed sample`), 
              HS_employed_sample= sum(`HS employed sample`),
              unemployed_sample=sum(`unemployed sample`), 
              inactive_sample=sum(`inactive sample`),
              employed_total=sum(`employed total`), 
              HS_employed_total=sum(`HS employed total`),
              unemployed_total=sum(`unemployed total`), 
              inactive_total=sum(`inactive total`)) -> year_average
  
  year_average$year<-year
  
  assign(paste0("Headline_",year),year_average,envir = globalenv())
  #write.csv(year_average,paste0("Outputs for GLMS/Headline_",year,".csv"))
  
}

Employment_counts_by<-function(dataset,demographic){
  weight<-names(dataset[grep("PWT", names(dataset))])[1]
  as.data.frame(
    dataset%>%
      group_by_(demographic,"Graduate_Type","ILODEFR","SOCHE")%>%
      dplyr::summarise(sample=n(),
                       population=sum(!!sym(weight))))
}

variable_splits<-function(dataset, variable_y){
  
  dataset %>%
    group_by_(variable_y)%>% summarise(total=sum(population))->full_pop
  
  dataset[which(dataset$ILODEFR=="In employment"),] %>%
    group_by_(variable_y)%>% summarise(employed.count=sum(population), employed.sample=sum(sample))->in_employment
  
  dataset[which(dataset$ILODEFR=="In employment" & dataset$SOCHE=="High Skilled Employment"),] %>%
    group_by_(variable_y)%>% summarise(HSemp.count=sum(population),HSemp.sample=sum(sample))->in_HSemployment
  
  dataset[which(dataset$ILODEFR=="ILO unemployed"),] %>%
    group_by_(variable_y)%>% summarise(Unemp.count=sum(population),Unemp.sample=sum(sample))->unemployed
  
  dataset[which(dataset$ILODEFR=="Inactive"),] %>%
    group_by_(variable_y)%>% summarise(Inactive.count=sum(population),Inactive.sample=sum(sample))->inactive
  
  list<-list(full_pop,in_employment,in_HSemployment, unemployed,inactive)
  
  total_count<-Reduce(function(x, y) merge(x, y, all.x=T, 
                                           by=c(variable_y)), list, accumulate=F)
  
  
  
  Graduate_breakdown<-data.frame( total_count[variable_y],
                                  total_count$employed.count/total_count$total, 
                                  total_count$employed.sample,
                                  total_count$HSemp.count/total_count$total, 
                                  total_count$HSemp.sample,
                                  total_count$Unemp.count/(total_count$Unemp.count + total_count$employed.count),
                                  total_count$Unemp.sample,
                                  total_count$Inactive.count/total_count$total,
                                  total_count$Inactive.sample)
  
  colnames(Graduate_breakdown)<-c("Breakdown","Employment_rate","Employed_sample",
                                  "High_skill_emp_rate","High_Skill_employed_sample",
                                  "Unemployment_rate","Unemployed_sample",
                                  "Inactivity_rate","Inactive_sample")
  
  return(Graduate_breakdown)
}

#provide employment rates for the year broken down by characteristic
Graduate_breakdown<-function(quarter,year){
  
  #select dataset: for example if quarter =1 and year =2016 this will get Q1_2016 and set this to 'dataset'
  dataset<-get(paste0("Q",quarter,"_",year))
  
  #reduce dataset to just graduates
  dataset<-dataset[which(dataset$Graduate_Type=="Graduate"),]
  
  Employment_counts_by(dataset,"AgeGroup")%>%  variable_splits(.,"AgeGroup")->Age_group_rates
  Age_group_rates$Population<-"AgeGroup"
  
  Employment_counts_by(dataset,"SEX") %>% variable_splits(.,"SEX")->Gender_rates
  Gender_rates$Population<-"SEX"
  
  Employment_counts_by(dataset,"ETHNICITY") %>% variable_splits(.,"ETHNICITY")->Ethnicity_rates
  Ethnicity_rates$Population<-"ETHNICITY"
  
  Employment_counts_by(dataset,"DISEA") %>% variable_splits(.,"DISEA")->Disability_rates
  Disability_rates$Population<-"DISEA"
  
  Employment_counts_by(dataset,"DEGCLS7") %>% variable_splits(.,"DEGCLS7")->Degree_class_rates
  Degree_class_rates$Population<-"DEGCLS7"
  
  Employment_counts_by(dataset,"STEM") %>% variable_splits(.,"STEM")->STEM_rates
  STEM_rates$Population<-"STEM"
  
  Employment_counts_by(dataset,"LEM") %>% variable_splits(.,"LEM")->LEM_rates
  LEM_rates$Population<-"LEM"
  
  Employment_counts_by(dataset,"OSSAH") %>% variable_splits(.,"OSSAH")->OSSAH_rates
  OSSAH_rates$Population<-"OSSAH"
  
  Employment_counts_by(dataset,"GOVTOF2") %>% variable_splits(.,"GOVTOF2")->Region_rates
  Region_rates$Population<-"GOVTOF2"
  
  quarter<-rbind(Age_group_rates,Gender_rates,Ethnicity_rates, Disability_rates, Degree_class_rates,
                 STEM_rates, LEM_rates, OSSAH_rates,Region_rates)
  
  quarter$Group<-"Graduates:16-64"
  
  #reduce dataset to just young graduates
  dataset<-dataset[which(dataset$Graduate_Type=="Graduate" & dataset$AgeGroup=="21-30"),]
  
  
  Employment_counts_by(dataset,"SEX") %>% variable_splits(.,"SEX")->Gender_rates
  Gender_rates$Population<-"SEX"
  
  Employment_counts_by(dataset,"ETHNICITY") %>% variable_splits(.,"ETHNICITY")->Ethnicity_rates
  Ethnicity_rates$Population<-"ETHNICITY"
  
  Employment_counts_by(dataset,"DISEA") %>% variable_splits(.,"DISEA")->Disability_rates
  Disability_rates$Population<-"DISEA"
  
  Employment_counts_by(dataset,"DEGCLS7") %>% variable_splits(.,"DEGCLS7")->Degree_class_rates
  Degree_class_rates$Population<-"DEGCLS7"
  
  Employment_counts_by(dataset,"STEM") %>% variable_splits(.,"STEM")->STEM_rates
  STEM_rates$Population<-"STEM"
  
  Employment_counts_by(dataset,"LEM") %>% variable_splits(.,"LEM")->LEM_rates
  LEM_rates$Population<-"LEM"
  
  Employment_counts_by(dataset,"OSSAH") %>% variable_splits(.,"OSSAH")->OSSAH_rates
  OSSAH_rates$Population<-"OSSAH"
  
  Employment_counts_by(dataset,"GOVTOF2") %>% variable_splits(.,"GOVTOF2")->Region_rates
  Region_rates$Population<-"GOVTOF2"
  
  
  quarter2<-rbind(Age_group_rates,Gender_rates,Ethnicity_rates, Disability_rates, Degree_class_rates,
                  STEM_rates, LEM_rates, OSSAH_rates,Region_rates)
  
  quarter2$Group<-"Graduates:21-30"
  
  quarter<-rbind(quarter,quarter2)
  return(quarter)
  
}
Graduate_breakdown_year<-function(year){
  #select dataset: for example if quarter =1 and year =2016 this will get Q1_2016 and set this to 'dataset1'
  dataset1<-Graduate_breakdown(1,year)
  dataset2<-Graduate_breakdown(2,year)
  dataset3<-Graduate_breakdown(3,year)
  dataset4<-Graduate_breakdown(4,year)
  
  year_data<- rbind(dataset1,dataset2,dataset3,dataset4)
  
  means<-ddply(year_data, .(Group,Population, Breakdown), colwise(mean, .(Employment_rate, 
                                                                          High_skill_emp_rate, 
                                                                          Unemployment_rate, Inactivity_rate),na.rm=TRUE))
  
  samples<-ddply(year_data, .(Group,Population, Breakdown),colwise(sum, .(Employed_sample, 
                                                                          High_Skill_employed_sample, 
                                                                          Unemployed_sample, Inactive_sample),na.rm=TRUE))
  
  output<-merge(x=means, y=samples, by=c("Group","Population","Breakdown"), all=TRUE)
  
  output<-output[c("Group",
                   "Population",
                   "Breakdown",
                   "Employment_rate",
                   "Employed_sample",
                   "High_skill_emp_rate",
                   "High_Skill_employed_sample",
                   "Unemployment_rate",
                   "Unemployed_sample",
                   "Inactivity_rate",
                   "Inactive_sample")]
  
  assign(paste0("Graduate_breakdown_",year),output,envir = globalenv())
  write.csv(output,paste0("Outputs for GLMS/Graduate_breakdown_",year,".csv"))
  
}

#########################################################################################
#                           Average Salary by demographic                              #
#########################################################################################

#Find the median salaries by demographic characteristic, and reshape (melt) data to have the 
#characteristic as a variable rather than column name

Average_salary_by<-function(dataset,demographic){
  #find the weight variable
  weight<-names(dataset[grep("PIWT", names(dataset))])[1]
  
  as.data.frame(
    dataset%>%
      group_by_(demographic)%>%
      dplyr::summarise(median=weighted.median(GRSSWK,!!sym(weight)),
                       sample=n(),
                       population=sum(!!sym(weight))))%>%
    melt(id=c("median","sample","population"))
  #take a dataset; based on the demographic characeristic separate the dataet into groups; find the
  #median of GRSSWK (gross weekly pay) and weight on PIWT17 (where approriate)
  #reshape dataset for ease of output- turn the demographic into a variable rather than a column heading
}


Average_Salaries<-function(year){
  
  #select dataset: for example if quarter =1 and year =2016 this will get Q1_2016 and set this to 'dataset'
  dataset1<-get(paste0("Q1_",year))
  dataset2<-get(paste0("Q2_",year))
  dataset3<-get(paste0("Q3_",year))
  dataset4<-get(paste0("Q4_",year))
  
  #rename weight variable
  reformat_dataset<-function(dataset){
    colnames<-c(names(dataset[grep("PIWT", names(dataset))][1]),
                "Graduate_Type",
                "AgeGroup",
                "ILODEFR",
                "FTPTWK",
                "GRSSWK")
    
    dataset<- dataset[,colnames]
    
    weight_var<-names(dataset[grep("PIWT", names(dataset))])[1]
    weight_index<-match(colnames(dataset),weight_var,0)
    colnames(dataset)[colnames(dataset) %in% weight_var]  <- c("PIWT")
    return(dataset)
  }
  
  dataset1<-reformat_dataset(dataset1)
  dataset2<-reformat_dataset(dataset2)
  dataset3<-reformat_dataset(dataset3)
  dataset4<-reformat_dataset(dataset4)
  
  
  dataset<-rbind(dataset1, dataset2, dataset3, dataset4)
  
  #Restrict to just those in full time employment
  dataset<-dataset[which(dataset$FTPTWK=="Full-time" & dataset$ILODEFR=="In employment"),]
  
  #find the weight variable for the dataset
  weight<-names(dataset[grep("PIWT", names(dataset))])[1]
  
  #restrict to just those with an income weight. Those with 0 weight wouldn't be included in the weighted median anyway, but removing at this stage speeds up computation.
  dataset<-dataset[which(dataset[,weight]!=0),]
  
  
  #################################################
  #     Gross Weekly pay in main job-             #
  #################################################
  
  
  ##Split by Age and Graduate Type
  all<-as.data.frame(
    dataset%>%
      group_by(AgeGroup, Graduate_Type)%>%
      dplyr::summarise(median=weighted.median(GRSSWK,!!sym(weight)),
                       sample=n(),
                       population=sum(!!sym(weight))))
  
  #Just split by graduate type (i.e. across all agebands)
  #Use the function just defined above
  all_total<-  Average_salary_by(dataset,"Graduate_Type")
  
  #Add in an 'agegroup' for the average weekly salaries by graduate type and rename the column headers: 
  #this way we can merge the datasets as they'll have the same headers
  all_total$variable<-"Total"
  names(all_total)<-c("median",
                      "sample",
                      "population",
                      "AgeGroup",
                      "Graduate_Type")
  
  #append the average salaries for people by graduate type and ageband with those for all agebands.
  all_sal<-rbind(all, all_total)
  all_sal$year<-year
  
  #rename the young_grad_sal dataset to reference the year and quarter it's run on and output to the global environment.
  assign(paste0("Sal_",year),all_sal,envir = globalenv())
  
}


#############################################################################################
#                 Restrict to just graduates: Graduate Breakdown salaries                                                #
#                 Most recent year only
#############################################################################################
Graduate_breakdown_salaries<-function(year){
  
  #select dataset: for example if quarter =1 and year =2016 this will get Q1_2016 and set this to 'dataset1'
  dataset1<-get(paste0("Q1_",year))
  dataset2<-get(paste0("Q2_",year))
  dataset3<-get(paste0("Q3_",year))
  dataset4<-get(paste0("Q4_",year))
  
  dataset<-rbind(dataset1, dataset2, dataset3, dataset4)
  
  #Restrict to just those in full time employment
  dataset<-dataset[which(dataset$FTPTWK=="Full-time" & dataset$ILODEFR=="In employment"),]
  
  #find the weight variable for the dataset
  weight<-names(dataset[grep("PIWT", names(dataset))])[1]
  
  #restrict to just those with an income weight. Those with 0 weight wouldn't be included in the weighted median anyway, but removing at this stage speeds up computation.
  dataset<-dataset[which(dataset[,weight]!=0),]
  
  
  #restrict to just graduates
  dataset<-dataset[which(dataset$Graduate_Type=="Graduate"),]
  
  #Average salary by ageband
  age<-Average_salary_by(dataset,"AgeGroup")
  
  #Repeat for average salary by gender
  sex<-Average_salary_by(dataset,"SEX")
  
  #Average salary by ethnicity
  ethnicity<-Average_salary_by(dataset,"ETHNICITY")
  
  #Average salary by disability status
  
  disability<-Average_salary_by(dataset,"DISEA")
  
  #Average salary by degree classification
  degreeclass<-Average_salary_by(dataset,"DEGCLS7")
  
  #Average salary by STEM/non-STEM graduates
  stem<-Average_salary_by(dataset,"STEM")
  
  #Average salary by LEM/non-LEM graduates
  lem<-Average_salary_by(dataset,"LEM")
  #Average salary by OSSAH/non-OSSAH graduates
  ossah<-Average_salary_by(dataset,"OSSAH")
  
  #Average salary by occupational group
  occupation<-Average_salary_by(dataset,"Occupation")
  
  #Average salary by industry category
  industry<-Average_salary_by(dataset,"INDE07M")
  
  #Average salary by region 
  region<-Average_salary_by(dataset,"GOVTOF2")
  
  #append datasets to each other to create one dataset of graduate salaries for different demographic characteristics
  grad_sal<-rbind(age, 
                  sex,
                  ethnicity,
                  disability, 
                  degreeclass, 
                  stem, 
                  lem,
                  ossah,
                  occupation, 
                  industry,
                  region)
  
  
  #rename the grad_sal dataset to reference the year and quarter it's run on and output to the global environment.
  assign(paste0("grad_sal_",year),grad_sal,envir = globalenv())
  
  write.csv(grad_sal,paste0("Outputs for GLMS/GRAD_SAL_",year,".csv"))
  
  #########################################################
  #         Repeat but just for young graduates(21-30)   #
  #########################################################
  dataset<-dataset[which(dataset$Graduate_Type=="Graduate" & dataset$AgeGroup=="21-30"),]
  
  #Average salary by ageband
  age<-Average_salary_by(dataset,"AgeGroup")
  
  #Repeat for average salary by gender
  sex<-Average_salary_by(dataset,"SEX")
  
  #Average salary by ethnicity
  ethnicity<-Average_salary_by(dataset,"ETHNICITY")
  
  #Average salary by disability status
  
  disability<-Average_salary_by(dataset,"DISEA")
  
  #Average salary by degree classification
  degreeclass<-Average_salary_by(dataset,"DEGCLS7")
  
  #Average salary by STEM/non-STEM graduates
  stem<-Average_salary_by(dataset,"STEM")
  
  #Average salary by LEM/non-LEM graduates
  lem<-Average_salary_by(dataset,"LEM")
  #Average salary by OSSAH/non-OSSAH graduates
  ossah<-Average_salary_by(dataset,"OSSAH")
  
  #Average salary by occupational group
  occupation<-Average_salary_by(dataset,"Occupation")
  
  #Average salary by industry category
  industry<-Average_salary_by(dataset,"INDE07M")
  
  #Average salary by region 
  region<-Average_salary_by(dataset,"GOVTOF2")
  
  #append datasets to each other to create one dataset of graduate salaries for different demographic characteristics
  young_grad_sal<-rbind(age, 
                        sex, 
                        ethnicity,
                        disability,
                        degreeclass,
                        stem, 
                        lem, 
                        ossah,
                        occupation, 
                        industry,
                        region)
  
  #rename the young_grad_sal dataset to reference the year and quarter it's run on and output to the global environment.
  assign(paste0("young_grad_sal_",year),young_grad_sal,envir = globalenv())
  #Also write to an Excel file for ease of use by economics team.
  
  write.csv(young_grad_sal,paste0("Outputs for GLMS/YOUNG_GRAD_SAL_",year,".csv"))
}



#######################################################
#           1. Import data                            #
#######################################################


start_year=2006 #Which year do you want to use LFS data from?
end_year=2017 #Which year do you want the LFS data up to?

#### OPTIONAL STEP###
#Convert all LFS data stored in SPSS format from Q1 of the start year to Q4 of the end year into R data format
#unless this has been previously completed and files are in your R working directory. In which case skip this step.

#sapply(start_year:end_year, function(y)sapply(1:4,function(x)readLFS(x,y)))

#Note data is sent to a file LFS datasets within your project file: make sure the file is created

#####################

#After Import/If you have the .rds files already run
  sapply(start_year:end_year, function(y)sapply(1:4, function(x) read_LFS_from_project(x,y)))
          

#use the attributes of Q1_2016 to recode the numbers as text strings if HIQUAL15 isn't stored as a factor
high_qual15<-attributes(Q1_2016$HIQUAL15)[[1]]

#Recode the LFS variables to those used in the publication
sapply(start_year:end_year, function(y)sapply(1:4, function(x) recode_variables(x,y)))

#Output employment rates
#This outputs the proprotion in employment for each quarter. 
sapply(start_year:end_year, function(y)sapply(1:4, function(x) Employment_rate(x,y)))

#Create headline statistics for each year (mean average of rates in each quarter)
sapply(start_year:end_year,function(x)year_average_employment_rate(x))
list<-lapply(start_year:end_year,function(x)get(paste0("Headline_",x)))

Yearly_employment<-Reduce(rbind,list)

write.csv(Yearly_employment,"Outputs for GLMS/Headline_employment_rates.csv")

#Create the graduate breakdown for the latest year
Graduate_breakdown_year(end_year)

#Output weekly salaries
#This uses the data for the whole year to output median salaries
sapply(start_year:end_year, function(y)Average_Salaries(y))

list<-lapply(start_year:end_year,function(x)get(paste0("Sal_",x)))

Yearly_salaries<-Reduce(rbind,list)

write.csv(Yearly_salaries,"Outputs for GLMS/Yearly_salaries.csv")

#And salaries for the graduate breakdowns
Graduate_breakdown_salaries(end_year)

#######################################
#And confidence intervals
#######################################

# See link below for a discussion of accounting for survey design and weighting
# https://www.ons.gov.uk/methodology/methodologicalpublications/generalmethodology/onsworkingpaperseries/onsmethodologyworkingpaperseriesno9guidetocalculatingstandarderrorsforonssocialsurveys#accounting-for-the-estimation-method


options(survey.lonely.psu="adjust") #lonley PSUs make no contribution to the variance.

confidence_intervals<-function(quarter,year,young){
  dataset_name<-paste0("Q",quarter,"_",year)
  dataset<-get(dataset_name)
  
  #find the weight variable
  weight<-names(dataset[grep("PWT", names(dataset))])[1]
  
  dataset<-dataset[which(dataset[,weight]>=0),]
  
  #subest to 21-30s if young=TRUE
  if(young==TRUE){dataset<-dataset[which(dataset$AgeGroup=="21-30"),]}
  
  dataset$hhld_id<-substring(dataset$CASENO,1,13)
  
  #define graduates, post graduates and non_graduates
  dataset$Grad<-0
  dataset$Grad[dataset$Graduate_Type=="Graduate"]<-1
  
  dataset$PostGrad<-0
  dataset$PostGrad[dataset$Graduate_Type=="PostGrad"]<-1
  
  dataset$NonGrad<-0
  dataset$NonGrad[dataset$Graduate_Type=="Non Grad"]<-1
  
  #define employed graduates, post graduate and non graduates
  dataset$Emp_Grad<-0
  dataset$Emp_Grad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="Graduate"]<-1
  
  dataset$Emp_PostGrad<-0
  dataset$Emp_PostGrad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="PostGrad"]<-1
  
  dataset$Emp_NonGrad<-0
  dataset$Emp_NonGrad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="Non Grad"]<-1
  
  #define graduates, post graduates and non graduates in high skilled employment
  
  dataset$HSEmp_Grad<-0
  dataset$HSEmp_Grad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="Graduate"& 
                       dataset$SOCHE=="High Skilled Employment"]<-1
  
  dataset$HSEmp_PostGrad<-0
  dataset$HSEmp_PostGrad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="PostGrad"& 
                           dataset$SOCHE=="High Skilled Employment"]<-1
  
  dataset$HSEmp_NonGrad<-0
  dataset$HSEmp_NonGrad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="Non Grad"& 
                          dataset$SOCHE=="High Skilled Employment"]<-1
  
  #define Unemployed graduates, post graduate and non graduates
  dataset$Unemp_Grad<-0
  dataset$Unemp_Grad[dataset$ILODEFR=="ILO unemployed" & dataset$Graduate_Type=="Graduate"]<-1
  
  dataset$Unemp_PostGrad<-0
  dataset$Unemp_PostGrad[dataset$ILODEFR=="ILO unemployed" & dataset$Graduate_Type=="PostGrad"]<-1
  
  dataset$Unemp_NonGrad<-0
  dataset$Unemp_NonGrad[dataset$ILODEFR=="ILO unemployed" & dataset$Graduate_Type=="Non Grad"]<-1
  
  #define economically active graduates, post graduate and non graduates
  dataset$Act_Grad<-0
  dataset$Act_Grad[(dataset$ILODEFR=="ILO unemployed"| dataset$ILODEFR=="In employment")
                   & dataset$Graduate_Type=="Graduate"]<-1
  
  dataset$Act_PostGrad<-0
  dataset$Act_PostGrad[(dataset$ILODEFR=="ILO unemployed"| dataset$ILODEFR=="In employment") 
                       & dataset$Graduate_Type=="PostGrad"]<-1
  
  dataset$Act_NonGrad<-0
  dataset$Act_NonGrad[(dataset$ILODEFR=="ILO unemployed"| dataset$ILODEFR=="In employment")
                      & dataset$Graduate_Type=="Non Grad"]<-1
  
  
  #dataset<<-dataset
  #Define the survey design:
  #clusters- households
  #strata- low level area identifier (Unitary Local Authoritary)
  #weight- person weight
  weight_formula<-as.formula(paste0("~",weight))
  
  surveydesign<-svydesign(id=~hhld_id, data=dataset, strata=~UALA, weight=weight_formula,nest=TRUE)
  #surveydesign<<-surveydesign
  
  #work out the standard error for graduates
  #Note the programe saves the variance = SE^2
  
  ratio_and_confidence_int<-function(Numerator_input, Demoninator_input){
    Numerator<-as.formula(paste0("~",Numerator_input))
    Denominator<-as.formula(paste0("~",Demoninator_input))
    #calcualte mean and standard error
    ratio_output<-svyratio(numerator=Numerator,denominator=Denominator, design = surveydesign)
    #calcualte confidence intervals
    confidence_int<-confint(ratio_output,  level = 0.95,df =degf(surveydesign))
    #reformat to dataframe
    ratio_output<-data.frame(matrix(unlist(ratio_output),nrow=1, byrow=T),stringsAsFactors = FALSE)
    confidence_int<-data.frame(matrix(unlist(confidence_int),nrow=1, byrow=T),stringsAsFactors = FALSE)
    #We sqare root to get back to the standard error
    ratio_output$X2<-sqrt(as.numeric(as.character(ratio_output$X2)))
    
    #summarise as 1 dataframe
    output<-data.frame(cbind(ratio_output$X1, confidence_int, ratio_output$X2),stringsAsFactors = FALSE)
    
    numerator_type<-strsplit(Numerator_input,"_")[[1]][1]
    
    
    names(output)<-c(paste0(numerator_type,"_prop"),paste0(numerator_type,"_prop_lower"),
                     paste0(numerator_type,"_prop_upper"),paste0(numerator_type,"_SE"))
    
    output$population<-strsplit(Numerator_input,"_")[[1]][2]
    output$Quarter<-dataset_name
    
    return(output)
  }
  #graduates
  grad_emp<-ratio_and_confidence_int("Emp_Grad","Grad")
  grad_HSemp<-ratio_and_confidence_int("HSEmp_Grad","Grad")
  grad_Unemp<-ratio_and_confidence_int("Unemp_Grad","Act_Grad")
  
  list<-list(grad_emp, grad_HSemp, grad_Unemp)
  
  grad<-Reduce(function(x, y) merge(x, y, all.x=T, 
                                    by=c("population","Quarter")), list, accumulate=F)
  
  #postgraduates
  PostGrad_emp<-ratio_and_confidence_int("Emp_PostGrad","PostGrad")
  PostGrad_HSemp<-ratio_and_confidence_int("HSEmp_PostGrad","PostGrad")
  PostGrad_Unemp<-ratio_and_confidence_int("Unemp_PostGrad","Act_PostGrad")
  
  list<-list(PostGrad_emp, PostGrad_HSemp, PostGrad_Unemp)
  
  PostGrad<-Reduce(function(x, y) merge(x, y, all.x=T, 
                                        by=c("population","Quarter")), list, accumulate=F)
  
  #Non graduates
  NonGrad_emp<-ratio_and_confidence_int("Emp_NonGrad","NonGrad")
  NonGrad_HSemp<-ratio_and_confidence_int("HSEmp_NonGrad","NonGrad")
  NonGrad_Unemp<-ratio_and_confidence_int("Unemp_NonGrad","Act_NonGrad")
  
  list<-list(NonGrad_emp, NonGrad_HSemp, NonGrad_Unemp)
  
  NonGrad<-Reduce(function(x, y) merge(x, y, all.x=T, 
                                       by=c("population","Quarter")), list, accumulate=F)
  
  
  ratios<-rbind(grad,PostGrad,NonGrad)
  
  #rename to keep in sync with Graduate_Types
  ratios$population[ratios$population=="Grad"]<-"Graduate"
  ratios$population[ratios$population=="NonGrad"]<-"Non Grad"
  
  
  #Also want the sample and population sizes 
  dataset %>%
    group_by(ILODEFR,Graduate_Type)%>% 
    summarise(count=n(),weighted_pop=sum(!!sym(weight))) %>% 
    gather(variable, value, -(Graduate_Type:ILODEFR)) %>%
    unite(temp, ILODEFR, variable) %>%
    spread(temp, value)->samples
  
  dataset[which(dataset$SOCHE=="High Skilled Employment" & dataset$ILODEFR=="In employment"),] %>%
    group_by(Graduate_Type)%>% 
    summarise(HS_emp_count=n(),HS_emp_weighted_pop=sum(!!sym(weight))) ->sample2
  
  samples<-merge(x=samples, y=sample2, by=("Graduate_Type"))
  
  colnames(samples)[colnames(samples)=="Graduate_Type"]<-"population"
  
  ratios<-merge(x=ratios, y=samples, by=c("population"), all.x=TRUE)
  
  return(ratios)
  
}

#Find confidence intervals from start year to end year
output<-lapply(start_year:end_year, function(y){lapply(1:4,function(x)confidence_intervals(x,y,FALSE))})


#store as dataframe
output2<-Reduce(rbind,lapply(1:length(output),function(x)Reduce(rbind, output[[x]])))

output2$year<-substring(output2$Quarter,4,7)
output2$Quarter<-substring(output2$Quarter,2,2)

output2<-output2[c("population","year",names(output2)[2:22])]

#sort by population and quarter
output3<-output2[with(output2, order(population, year, Quarter)), ]
output3$age<-"16-64"

#repeat for young_graduates

#Find confidence intervals from start year to end year
output<-lapply(start_year:end_year, function(y){lapply(1:4,function(x)confidence_intervals(x,y,TRUE))})


#store as dataframe
output2<-Reduce(rbind,lapply(1:length(output),function(x)Reduce(rbind, output[[x]])))

output2$year<-substring(output2$Quarter,4,7)
output2$Quarter<-substring(output2$Quarter,2,2)

output2<-output2[c("population","year",names(output2)[2:22])]

#sort by population and quarter
output4<-output2[with(output2, order(population, year, Quarter)), ]
output4$age<-"21-30"

output5<-rbind(output3,output4)

write.csv(output5,paste0("Outputs for GLMS/Emp_confidence_Intervals.csv"))