##This code imports LFS data from SPSS files and finds the employment rate and median salaries for graudates #based on various characteristics ## library("foreign") library("plyr") library("reshape2") library("rlang") library("spatstat") library("dplyr") library("lazyeval") library("survey") library("tidyr") ####################################################### # Functions used within the code # ####################################################### ###################################################################################### #Add in the function to change SPSS data to R data ###################################################################################### #For the fifth edition (June 2017), the 2016 income and person weights were replaced with 2017 weights #(variables piwt17 and pwt17 from 2012 onwards readLFS<-function(quarter, year){ #Latest data test<-read.spss(paste0("LFS datasets/",year," Q",quarter,".sav"),to.data.frame=TRUE) #Select variables needed colnames<-c("COUNTRY","SEX","AGE", names(test[grep("PWT", names(test))]), names(test[grep("PIWT", names(test))]), names(test[grep("HIQUAL", names(test))]),"HIGHO", names(test[grep("DEGREE", names(test))]),"SC2KMMJ", "SC10MMJ","ETH11EW", "SNGDEGB","SNGHD","FDSNGDEG","ILODEFR","DISEA","DEGCLS7", "FTPTWK","GRSSWK", "INDE07M","UALA","CASENO","GOVTOF2") #not all variables are in each dataset available_columns<-colnames[which(colnames %in% names(test))] test2<- test[,available_columns] #restrict to population in England aged 16-64 test2<-test2[which(test2$COUNTRY=="England" & test2$AGE>15 & test2$AGE<65),] saveRDS(object = test2, file = paste0("LFS datasets/Q",quarter,"_",year,".rds")) } read_LFS_from_project<-function(quarter, year){ temp <-readRDS(paste0("LFS datasets/Q",quarter,"_", year,".rds")) assign(paste0("Q",quarter,"_",year),temp,envir = globalenv()) } ###################################################################################### #Add in the function to recode variables in the LFS to those used in the publication ###################################################################################### recode_variables<- function(quarter, year){ #fetch the dataset from the global environment: for example Q1_2016 dataset<-get(paste0("Q",quarter,"_",year)) #which HIQUAL variable is in use? #use 'highqual15' for LFS beyond 2014 #For pre 2014 use HIQUAL11<- 2012-2014, HIQUAL5 2006-2011 hiqual_var<-names(dataset[grep("HIQUAL", names(dataset))])[1] if(class(dataset[,c(hiqual_var)])=="numeric"){ #add in error warning if not a factor/string variable paste0("Warning: dataset Q",quarter," ",year,"is missing factor levels in HIQUAL") } # Check if data is the right type, if not convert to a factor with the correct value levels. if("HIQUAL15" %in% names(dataset)& class(dataset$HIQUAL15)=="numeric"){ dataset$HIQUAL15<- as.factor(dataset$HIQUAL15) levels(dataset$HIQUAL15)<-high_qual15 } ##Define Graduate Type #Define graduates, postgraduates and non-graduates dataset$Graduate_Type<-NA #Graduates are those with batchelor degrees only #For 2006 Degree type is set by DEGREE4, not DEGREE7 if("DEGREE71" %in% names(dataset)){ dataset$Graduate_Type[(dataset[,c(hiqual_var)]=="First degree/foundation degree"| dataset[,c(hiqual_var)]=="First/Foundation degree" ) & ( dataset$DEGREE71=="first degree" | dataset$DEGREE72=="first degree" | dataset$DEGREE73=="first degree" | dataset$DEGREE74=="first degree" | dataset$DEGREE75=="first degree" )]<-"Graduate"} if("DEGREE4" %in% names(dataset)){ dataset$Graduate_Type[(dataset[,c(hiqual_var)]=="First degree/foundation degree"| dataset[,c(hiqual_var)]=="First/Foundation degree" ) & ( dataset$DEGREE4=="A first degree" )]<-"Graduate"} # Define postgraduate qualification level #Note this includes 'Other Post Grad degree or professional qual' as in Annex A dataset$Graduate_Type[dataset[,c(hiqual_var)]=="Higher degree" & dataset$HIGHO != "Dont know"]<-"PostGrad" #Define those not in scope of this publication dataset$Graduate_Type[dataset[,c(hiqual_var)]=="Level 8 Diploma"| dataset[,c(hiqual_var)]=="Level 8 Certificate"| dataset[,c(hiqual_var)]=="Level 8 Award"| dataset[,c(hiqual_var)]=="Level 7 Certificate"| dataset[,c(hiqual_var)]=="Level 7 Diploma"| dataset[,c(hiqual_var)]=='Level 7 Award'| dataset[,c(hiqual_var)]=='Level 6 Award'| dataset[,c(hiqual_var)]=='Level 6 Certificate'| dataset[,c(hiqual_var)]=='Level 6 Diploma'| dataset[,c(hiqual_var)]=="Don.t know"| dataset[,c(hiqual_var)]=="Don't know"| dataset[,c(hiqual_var)]=="Did not know"| (dataset[,c(hiqual_var)]=="Higher degree" & dataset$HIGHO =="Dont know")| (dataset[,c(hiqual_var)]=="Higher degree" & is.na(dataset$HIGHO)) ]<-"Not in scope" #Set everyone who isn't a graduate, post graduate or not in scope to a non graduate dataset$Graduate_Type[is.na(dataset$Graduate_Type)]<-"Non Grad" # AgeGroup. dataset$AgeGroup[dataset$AGE %in% 16:20]<-"16-20" dataset$AgeGroup[dataset$AGE %in% 21:30]<-"21-30" dataset$AgeGroup[dataset$AGE %in% 31:40]<-"31-40" dataset$AgeGroup[dataset$AGE %in% 41:50]<-"41-50" dataset$AgeGroup[dataset$AGE %in% 51:60]<-"51-60" dataset$AgeGroup[dataset$AGE %in% 61:64]<-"61-64" #The remaining variables are only added if they occur in the dataset if("SC10MMJ" %in% names(dataset)){ #Define skilled occupation groupings SOCHE1<-c("1 'Managers, Directors And Senior Officials'" ,"2 'Professional Occupations'" ,"3 'Associate Professional And Technical Occupations'") SOCHE2<-c("4 'Administrative And Secretarial Occupations'" ,"5 'Skilled Trades Occupations'" ,"6 'Caring, Leisure And Other Service Occupations'" ) SOCHE3<-c("7 'Sales And Customer Service Occupations'" ,"8 'Process, Plant And Machine Operatives'" ,"9 'Elementary Occupations'" ) dataset$SOCHE[dataset$SC10MMJ %in% SOCHE1]<-"High Skilled Employment" dataset$SOCHE[dataset$SC10MMJ %in% SOCHE2]<-"Medium Skilled Employment" dataset$SOCHE[dataset$SC10MMJ %in% SOCHE3]<-"Low Skilled Employment" dataset$SOCHE[is.na(dataset$SOCHE)]<-"Else" } #Define skilled occupation groupings- name of variable changes in earlier datasets if("SC2KMMJ" %in% names(dataset)){ SOCHE1<-c("1 'Managers and Senior Officials'" ,"2 'Professional occupations'" ,"3 'Associate Professional and Technical'" ) SOCHE2<-c("4 'Administrative and Secretarial'" ,"5 'Skilled Trades Occupations'" ,"6 'Personal Service Occupations'" ) SOCHE3<-c("7 'Sales and Customer Service Occupations'" ,"8 'Process, Plant and Machine Operatives'" ,"9 'Elementary Occupations'" ) dataset$SOCHE[dataset$SC2KMMJ %in% SOCHE1]<-"High Skilled Employment" dataset$SOCHE[dataset$SC2KMMJ %in% SOCHE2]<-"Medium Skilled Employment" dataset$SOCHE[dataset$SC2KMMJ %in% SOCHE3]<-"Low Skilled Employment" dataset$SOCHE[is.na(dataset$SOCHE)]<-"Else" } #Define ethnicity groups if("ETH11EW" %in% names(dataset)){ dataset$ETHNICITY<-dataset$ETH11EW dataset$ETHNICITY[dataset$ETH11EW %in% c("Mixed / Multiple ethnic groups" ,"Chinese","Arab")]<-"Other ethnic group." } #Define Subject Type if("SNGDEGB" %in% names(dataset)){ dataset$STEM[dataset$SNGDEGB %in% 1:9]<- 1 #10.1~Economics, Law, Business dataset$LEM[dataset$SNGDEGB %in% 11:12| (substr(dataset$FDSNGDEG,1,4)=="10.1" & dataset$Graduate_Type=="Graduate")| (substr(dataset$SNGHD,1,4)=="10.1" & dataset$Graduate_Type=="PostGrad")]<-1 #10.0~Social Studies #10.2:10.9~Politics, Sociology, Social Policy, Social Work,Anthropology, #Human and Social Geography, Other Social Studies dataset$OSSAH[dataset$SNGDEGB %in% 13:19| (substr(dataset$FDSNGDEG,1,4) %in% c("10.0","10.2","10.3","10.4","10.5", "10.6","10.7","10.8","10.9") & dataset$Graduate_Type=="Graduate")| (substr(dataset$SNGHD,1,4) %in% c("10.0","10.2","10.3","10.4","10.5", "10.6","10.7","10.8","10.9") & dataset$Graduate_Type=="PostGrad")]<-1 } #Define Occupation. if("SC10MMJ" %in% names(dataset)){ dataset$Occupation<-dataset$SC10MMJ dataset$Occupation<-factor(dataset$Occupation, levels=c(levels(dataset$Occupation),"10 'Medium Skilled Employment'","11 'Low Skilled Employment'")) dataset$Occupation[dataset$SC10MMJ %in% c("4 'Administrative And Secretarial Occupations'", "5 'Skilled Trades Occupations'", "6 'Caring, Leisure And Other Service Occupations'")]<-"10 'Medium Skilled Employment'" dataset$Occupation[dataset$SC10MMJ %in% c("7 'Sales And Customer Service Occupations'" , "8 'Process, Plant And Machine Operatives'" ,"9 'Elementary Occupations'" )]<-"11 'Low Skilled Employment'" } #Labels Occupation #1 'Managers, Directors and Senior Officials' #2 'Professional Occupations' #3 'Associate Professional and Technical Occupations' #10 'Medium Skilled Employment' #11 'Low Skilled Employment'. assign(paste0("Q",quarter,"_",year),dataset,envir = globalenv()) } ######################################################################################### # Employment rate by demographic # ######################################################################################### #Headline stats: #for 16-64s Headline_stats_function<-function(all,agebracket){ #subset data if needed if(agebracket=="16-64"){all<- all}else{all<-all[which(all$AgeGroup=="21-30"),]} all %>% group_by(Graduate_Type)%>% summarise(total=sum(freq),sample=sum(sample))->full_pop types<-full_pop["Graduate_Type"] all[which(all$ILODEFR=="In employment"),] %>% group_by(Graduate_Type)%>% summarise(count=sum(freq),sample=sum(sample))%>% merge(x=types, y=., by=c("Graduate_Type"), all.x=TRUE)->in_employment all[which(all$ILODEFR=="In employment" & all$SOCHE=="High Skilled Employment"),] %>% group_by(Graduate_Type)%>% summarise(count=sum(freq),sample=sum(sample))%>% merge(x=types, y=., by=c("Graduate_Type"), all.x=TRUE)->in_HSemployment all[which(all$ILODEFR=="ILO unemployed"),] %>% group_by(Graduate_Type)%>% summarise(count=sum(freq),sample=sum(sample))%>% merge(x=types, y=., by=c("Graduate_Type"), all.x=TRUE)->unemployed all[which(all$ILODEFR=="Inactive"),] %>% group_by(Graduate_Type)%>% summarise(count=sum(freq),sample=sum(sample))%>% merge(x=types, y=., by=c("Graduate_Type"), all.x=TRUE)->inactive Headline_stats<-data.frame( in_employment$Graduate_Type, in_employment$count/full_pop$total, in_HSemployment$count/full_pop$total, unemployed$count/(unemployed$count + in_employment$count), inactive$count/full_pop$total, in_employment$sample, in_HSemployment$sample, unemployed$sample, inactive$sample, in_employment$count, in_HSemployment$count, unemployed$count, inactive$count) Headline_stats$Age_range<-agebracket colnames(Headline_stats)<-c("Population","Employment_rate","High_skill_emp_rate","Unemployment_rate","Inactivity_rate", "Employed sample","HS employed sample", "unemployed sample", "inactive sample","employed total","HS employed total", "unemployed total","inactive total","Age_range") Headline_stats<-Headline_stats[c("Age_range","Population","Employment_rate","High_skill_emp_rate","Unemployment_rate","Inactivity_rate", "Employed sample","HS employed sample", "unemployed sample", "inactive sample","employed total","HS employed total", "unemployed total","inactive total")] } Employment_rate<-function(quarter, year){ #get the dataset from the global environment dataset<-get(paste0("Q",quarter,"_",year)) #find the weight variable weight<-names(dataset[grep("PWT", names(dataset))])[1] dataset<-dataset[which(dataset[,weight]>0),] # count all people by age, graduate type, ILO employment status and SOC group: weight people by the weight variable (such as PWT16) #also note sample size all<-dataset%>% group_by(AgeGroup,Graduate_Type,ILODEFR,SOCHE) %>% summarise(freq=sum(!!sym(weight)), sample=n()) Headline_stats<-Headline_stats_function(all,"16-64") #Headline stats for 21-30s Headline_stats_young<-Headline_stats_function(all,"21-30") headline_stats<-rbind(Headline_stats,Headline_stats_young) assign(paste0("Headline_Q",quarter,"_",year),headline_stats,envir = globalenv()) #write.csv(headline_stats,paste0("Outputs for GLMS/Headline_Q",quarter,"_",year,".csv")) } #combine employment rates for each quarter into a yearly mean average year_average_employment_rate<-function(year){ #select dataset: for example if quarter =1 and year =2016 this will get Q1_2016 and set this to 'dataset' dataset1<-get(paste0("Headline_Q1_",year)) dataset2<-get(paste0("Headline_Q2_",year)) dataset3<-get(paste0("Headline_Q3_",year)) dataset4<-get(paste0("Headline_Q4_",year)) year_data<-rbind(dataset1,dataset2,dataset3,dataset4) year_data%>% group_by( Age_range, Population) %>% summarise(Employment_rate=mean.default(Employment_rate), High_skill_emp_rate= mean.default(High_skill_emp_rate), Unemployment_rate=mean.default(Unemployment_rate), inactivity_rate= mean.default(Inactivity_rate), employed_sample=sum(`Employed sample`), HS_employed_sample= sum(`HS employed sample`), unemployed_sample=sum(`unemployed sample`), inactive_sample=sum(`inactive sample`), employed_total=sum(`employed total`), HS_employed_total=sum(`HS employed total`), unemployed_total=sum(`unemployed total`), inactive_total=sum(`inactive total`)) -> year_average year_average$year<-year assign(paste0("Headline_",year),year_average,envir = globalenv()) #write.csv(year_average,paste0("Outputs for GLMS/Headline_",year,".csv")) } Employment_counts_by<-function(dataset,demographic){ weight<-names(dataset[grep("PWT", names(dataset))])[1] as.data.frame( dataset%>% group_by_(demographic,"Graduate_Type","ILODEFR","SOCHE")%>% dplyr::summarise(sample=n(), population=sum(!!sym(weight)))) } variable_splits<-function(dataset, variable_y){ dataset %>% group_by_(variable_y)%>% summarise(total=sum(population))->full_pop dataset[which(dataset$ILODEFR=="In employment"),] %>% group_by_(variable_y)%>% summarise(employed.count=sum(population), employed.sample=sum(sample))->in_employment dataset[which(dataset$ILODEFR=="In employment" & dataset$SOCHE=="High Skilled Employment"),] %>% group_by_(variable_y)%>% summarise(HSemp.count=sum(population),HSemp.sample=sum(sample))->in_HSemployment dataset[which(dataset$ILODEFR=="ILO unemployed"),] %>% group_by_(variable_y)%>% summarise(Unemp.count=sum(population),Unemp.sample=sum(sample))->unemployed dataset[which(dataset$ILODEFR=="Inactive"),] %>% group_by_(variable_y)%>% summarise(Inactive.count=sum(population),Inactive.sample=sum(sample))->inactive list<-list(full_pop,in_employment,in_HSemployment, unemployed,inactive) total_count<-Reduce(function(x, y) merge(x, y, all.x=T, by=c(variable_y)), list, accumulate=F) Graduate_breakdown<-data.frame( total_count[variable_y], total_count$employed.count/total_count$total, total_count$employed.sample, total_count$HSemp.count/total_count$total, total_count$HSemp.sample, total_count$Unemp.count/(total_count$Unemp.count + total_count$employed.count), total_count$Unemp.sample, total_count$Inactive.count/total_count$total, total_count$Inactive.sample) colnames(Graduate_breakdown)<-c("Breakdown","Employment_rate","Employed_sample", "High_skill_emp_rate","High_Skill_employed_sample", "Unemployment_rate","Unemployed_sample", "Inactivity_rate","Inactive_sample") return(Graduate_breakdown) } #provide employment rates for the year broken down by characteristic Graduate_breakdown<-function(quarter,year){ #select dataset: for example if quarter =1 and year =2016 this will get Q1_2016 and set this to 'dataset' dataset<-get(paste0("Q",quarter,"_",year)) #reduce dataset to just graduates dataset<-dataset[which(dataset$Graduate_Type=="Graduate"),] Employment_counts_by(dataset,"AgeGroup")%>% variable_splits(.,"AgeGroup")->Age_group_rates Age_group_rates$Population<-"AgeGroup" Employment_counts_by(dataset,"SEX") %>% variable_splits(.,"SEX")->Gender_rates Gender_rates$Population<-"SEX" Employment_counts_by(dataset,"ETHNICITY") %>% variable_splits(.,"ETHNICITY")->Ethnicity_rates Ethnicity_rates$Population<-"ETHNICITY" Employment_counts_by(dataset,"DISEA") %>% variable_splits(.,"DISEA")->Disability_rates Disability_rates$Population<-"DISEA" Employment_counts_by(dataset,"DEGCLS7") %>% variable_splits(.,"DEGCLS7")->Degree_class_rates Degree_class_rates$Population<-"DEGCLS7" Employment_counts_by(dataset,"STEM") %>% variable_splits(.,"STEM")->STEM_rates STEM_rates$Population<-"STEM" Employment_counts_by(dataset,"LEM") %>% variable_splits(.,"LEM")->LEM_rates LEM_rates$Population<-"LEM" Employment_counts_by(dataset,"OSSAH") %>% variable_splits(.,"OSSAH")->OSSAH_rates OSSAH_rates$Population<-"OSSAH" Employment_counts_by(dataset,"GOVTOF2") %>% variable_splits(.,"GOVTOF2")->Region_rates Region_rates$Population<-"GOVTOF2" quarter<-rbind(Age_group_rates,Gender_rates,Ethnicity_rates, Disability_rates, Degree_class_rates, STEM_rates, LEM_rates, OSSAH_rates,Region_rates) quarter$Group<-"Graduates:16-64" #reduce dataset to just young graduates dataset<-dataset[which(dataset$Graduate_Type=="Graduate" & dataset$AgeGroup=="21-30"),] Employment_counts_by(dataset,"SEX") %>% variable_splits(.,"SEX")->Gender_rates Gender_rates$Population<-"SEX" Employment_counts_by(dataset,"ETHNICITY") %>% variable_splits(.,"ETHNICITY")->Ethnicity_rates Ethnicity_rates$Population<-"ETHNICITY" Employment_counts_by(dataset,"DISEA") %>% variable_splits(.,"DISEA")->Disability_rates Disability_rates$Population<-"DISEA" Employment_counts_by(dataset,"DEGCLS7") %>% variable_splits(.,"DEGCLS7")->Degree_class_rates Degree_class_rates$Population<-"DEGCLS7" Employment_counts_by(dataset,"STEM") %>% variable_splits(.,"STEM")->STEM_rates STEM_rates$Population<-"STEM" Employment_counts_by(dataset,"LEM") %>% variable_splits(.,"LEM")->LEM_rates LEM_rates$Population<-"LEM" Employment_counts_by(dataset,"OSSAH") %>% variable_splits(.,"OSSAH")->OSSAH_rates OSSAH_rates$Population<-"OSSAH" Employment_counts_by(dataset,"GOVTOF2") %>% variable_splits(.,"GOVTOF2")->Region_rates Region_rates$Population<-"GOVTOF2" quarter2<-rbind(Age_group_rates,Gender_rates,Ethnicity_rates, Disability_rates, Degree_class_rates, STEM_rates, LEM_rates, OSSAH_rates,Region_rates) quarter2$Group<-"Graduates:21-30" quarter<-rbind(quarter,quarter2) return(quarter) } Graduate_breakdown_year<-function(year){ #select dataset: for example if quarter =1 and year =2016 this will get Q1_2016 and set this to 'dataset1' dataset1<-Graduate_breakdown(1,year) dataset2<-Graduate_breakdown(2,year) dataset3<-Graduate_breakdown(3,year) dataset4<-Graduate_breakdown(4,year) year_data<- rbind(dataset1,dataset2,dataset3,dataset4) means<-ddply(year_data, .(Group,Population, Breakdown), colwise(mean, .(Employment_rate, High_skill_emp_rate, Unemployment_rate, Inactivity_rate),na.rm=TRUE)) samples<-ddply(year_data, .(Group,Population, Breakdown),colwise(sum, .(Employed_sample, High_Skill_employed_sample, Unemployed_sample, Inactive_sample),na.rm=TRUE)) output<-merge(x=means, y=samples, by=c("Group","Population","Breakdown"), all=TRUE) output<-output[c("Group", "Population", "Breakdown", "Employment_rate", "Employed_sample", "High_skill_emp_rate", "High_Skill_employed_sample", "Unemployment_rate", "Unemployed_sample", "Inactivity_rate", "Inactive_sample")] assign(paste0("Graduate_breakdown_",year),output,envir = globalenv()) write.csv(output,paste0("Outputs for GLMS/Graduate_breakdown_",year,".csv")) } ######################################################################################### # Average Salary by demographic # ######################################################################################### #Find the median salaries by demographic characteristic, and reshape (melt) data to have the #characteristic as a variable rather than column name Average_salary_by<-function(dataset,demographic){ #find the weight variable weight<-names(dataset[grep("PIWT", names(dataset))])[1] as.data.frame( dataset%>% group_by_(demographic)%>% dplyr::summarise(median=weighted.median(GRSSWK,!!sym(weight)), sample=n(), population=sum(!!sym(weight))))%>% melt(id=c("median","sample","population")) #take a dataset; based on the demographic characeristic separate the dataet into groups; find the #median of GRSSWK (gross weekly pay) and weight on PIWT17 (where approriate) #reshape dataset for ease of output- turn the demographic into a variable rather than a column heading } Average_Salaries<-function(year){ #select dataset: for example if quarter =1 and year =2016 this will get Q1_2016 and set this to 'dataset' dataset1<-get(paste0("Q1_",year)) dataset2<-get(paste0("Q2_",year)) dataset3<-get(paste0("Q3_",year)) dataset4<-get(paste0("Q4_",year)) #rename weight variable reformat_dataset<-function(dataset){ colnames<-c(names(dataset[grep("PIWT", names(dataset))][1]), "Graduate_Type", "AgeGroup", "ILODEFR", "FTPTWK", "GRSSWK") dataset<- dataset[,colnames] weight_var<-names(dataset[grep("PIWT", names(dataset))])[1] weight_index<-match(colnames(dataset),weight_var,0) colnames(dataset)[colnames(dataset) %in% weight_var] <- c("PIWT") return(dataset) } dataset1<-reformat_dataset(dataset1) dataset2<-reformat_dataset(dataset2) dataset3<-reformat_dataset(dataset3) dataset4<-reformat_dataset(dataset4) dataset<-rbind(dataset1, dataset2, dataset3, dataset4) #Restrict to just those in full time employment dataset<-dataset[which(dataset$FTPTWK=="Full-time" & dataset$ILODEFR=="In employment"),] #find the weight variable for the dataset weight<-names(dataset[grep("PIWT", names(dataset))])[1] #restrict to just those with an income weight. Those with 0 weight wouldn't be included in the weighted median anyway, but removing at this stage speeds up computation. dataset<-dataset[which(dataset[,weight]!=0),] ################################################# # Gross Weekly pay in main job- # ################################################# ##Split by Age and Graduate Type all<-as.data.frame( dataset%>% group_by(AgeGroup, Graduate_Type)%>% dplyr::summarise(median=weighted.median(GRSSWK,!!sym(weight)), sample=n(), population=sum(!!sym(weight)))) #Just split by graduate type (i.e. across all agebands) #Use the function just defined above all_total<- Average_salary_by(dataset,"Graduate_Type") #Add in an 'agegroup' for the average weekly salaries by graduate type and rename the column headers: #this way we can merge the datasets as they'll have the same headers all_total$variable<-"Total" names(all_total)<-c("median", "sample", "population", "AgeGroup", "Graduate_Type") #append the average salaries for people by graduate type and ageband with those for all agebands. all_sal<-rbind(all, all_total) all_sal$year<-year #rename the young_grad_sal dataset to reference the year and quarter it's run on and output to the global environment. assign(paste0("Sal_",year),all_sal,envir = globalenv()) } ############################################################################################# # Restrict to just graduates: Graduate Breakdown salaries # # Most recent year only ############################################################################################# Graduate_breakdown_salaries<-function(year){ #select dataset: for example if quarter =1 and year =2016 this will get Q1_2016 and set this to 'dataset1' dataset1<-get(paste0("Q1_",year)) dataset2<-get(paste0("Q2_",year)) dataset3<-get(paste0("Q3_",year)) dataset4<-get(paste0("Q4_",year)) dataset<-rbind(dataset1, dataset2, dataset3, dataset4) #Restrict to just those in full time employment dataset<-dataset[which(dataset$FTPTWK=="Full-time" & dataset$ILODEFR=="In employment"),] #find the weight variable for the dataset weight<-names(dataset[grep("PIWT", names(dataset))])[1] #restrict to just those with an income weight. Those with 0 weight wouldn't be included in the weighted median anyway, but removing at this stage speeds up computation. dataset<-dataset[which(dataset[,weight]!=0),] #restrict to just graduates dataset<-dataset[which(dataset$Graduate_Type=="Graduate"),] #Average salary by ageband age<-Average_salary_by(dataset,"AgeGroup") #Repeat for average salary by gender sex<-Average_salary_by(dataset,"SEX") #Average salary by ethnicity ethnicity<-Average_salary_by(dataset,"ETHNICITY") #Average salary by disability status disability<-Average_salary_by(dataset,"DISEA") #Average salary by degree classification degreeclass<-Average_salary_by(dataset,"DEGCLS7") #Average salary by STEM/non-STEM graduates stem<-Average_salary_by(dataset,"STEM") #Average salary by LEM/non-LEM graduates lem<-Average_salary_by(dataset,"LEM") #Average salary by OSSAH/non-OSSAH graduates ossah<-Average_salary_by(dataset,"OSSAH") #Average salary by occupational group occupation<-Average_salary_by(dataset,"Occupation") #Average salary by industry category industry<-Average_salary_by(dataset,"INDE07M") #Average salary by region region<-Average_salary_by(dataset,"GOVTOF2") #append datasets to each other to create one dataset of graduate salaries for different demographic characteristics grad_sal<-rbind(age, sex, ethnicity, disability, degreeclass, stem, lem, ossah, occupation, industry, region) #rename the grad_sal dataset to reference the year and quarter it's run on and output to the global environment. assign(paste0("grad_sal_",year),grad_sal,envir = globalenv()) write.csv(grad_sal,paste0("Outputs for GLMS/GRAD_SAL_",year,".csv")) ######################################################### # Repeat but just for young graduates(21-30) # ######################################################### dataset<-dataset[which(dataset$Graduate_Type=="Graduate" & dataset$AgeGroup=="21-30"),] #Average salary by ageband age<-Average_salary_by(dataset,"AgeGroup") #Repeat for average salary by gender sex<-Average_salary_by(dataset,"SEX") #Average salary by ethnicity ethnicity<-Average_salary_by(dataset,"ETHNICITY") #Average salary by disability status disability<-Average_salary_by(dataset,"DISEA") #Average salary by degree classification degreeclass<-Average_salary_by(dataset,"DEGCLS7") #Average salary by STEM/non-STEM graduates stem<-Average_salary_by(dataset,"STEM") #Average salary by LEM/non-LEM graduates lem<-Average_salary_by(dataset,"LEM") #Average salary by OSSAH/non-OSSAH graduates ossah<-Average_salary_by(dataset,"OSSAH") #Average salary by occupational group occupation<-Average_salary_by(dataset,"Occupation") #Average salary by industry category industry<-Average_salary_by(dataset,"INDE07M") #Average salary by region region<-Average_salary_by(dataset,"GOVTOF2") #append datasets to each other to create one dataset of graduate salaries for different demographic characteristics young_grad_sal<-rbind(age, sex, ethnicity, disability, degreeclass, stem, lem, ossah, occupation, industry, region) #rename the young_grad_sal dataset to reference the year and quarter it's run on and output to the global environment. assign(paste0("young_grad_sal_",year),young_grad_sal,envir = globalenv()) #Also write to an Excel file for ease of use by economics team. write.csv(young_grad_sal,paste0("Outputs for GLMS/YOUNG_GRAD_SAL_",year,".csv")) } ####################################################### # 1. Import data # ####################################################### start_year=2006 #Which year do you want to use LFS data from? end_year=2017 #Which year do you want the LFS data up to? #### OPTIONAL STEP### #Convert all LFS data stored in SPSS format from Q1 of the start year to Q4 of the end year into R data format #unless this has been previously completed and files are in your R working directory. In which case skip this step. #sapply(start_year:end_year, function(y)sapply(1:4,function(x)readLFS(x,y))) #Note data is sent to a file LFS datasets within your project file: make sure the file is created ##################### #After Import/If you have the .rds files already run sapply(start_year:end_year, function(y)sapply(1:4, function(x) read_LFS_from_project(x,y))) #use the attributes of Q1_2016 to recode the numbers as text strings if HIQUAL15 isn't stored as a factor high_qual15<-attributes(Q1_2016$HIQUAL15)[[1]] #Recode the LFS variables to those used in the publication sapply(start_year:end_year, function(y)sapply(1:4, function(x) recode_variables(x,y))) #Output employment rates #This outputs the proprotion in employment for each quarter. sapply(start_year:end_year, function(y)sapply(1:4, function(x) Employment_rate(x,y))) #Create headline statistics for each year (mean average of rates in each quarter) sapply(start_year:end_year,function(x)year_average_employment_rate(x)) list<-lapply(start_year:end_year,function(x)get(paste0("Headline_",x))) Yearly_employment<-Reduce(rbind,list) write.csv(Yearly_employment,"Outputs for GLMS/Headline_employment_rates.csv") #Create the graduate breakdown for the latest year Graduate_breakdown_year(end_year) #Output weekly salaries #This uses the data for the whole year to output median salaries sapply(start_year:end_year, function(y)Average_Salaries(y)) list<-lapply(start_year:end_year,function(x)get(paste0("Sal_",x))) Yearly_salaries<-Reduce(rbind,list) write.csv(Yearly_salaries,"Outputs for GLMS/Yearly_salaries.csv") #And salaries for the graduate breakdowns Graduate_breakdown_salaries(end_year) ####################################### #And confidence intervals ####################################### # See link below for a discussion of accounting for survey design and weighting # https://www.ons.gov.uk/methodology/methodologicalpublications/generalmethodology/onsworkingpaperseries/onsmethodologyworkingpaperseriesno9guidetocalculatingstandarderrorsforonssocialsurveys#accounting-for-the-estimation-method options(survey.lonely.psu="adjust") #lonley PSUs make no contribution to the variance. confidence_intervals<-function(quarter,year,young){ dataset_name<-paste0("Q",quarter,"_",year) dataset<-get(dataset_name) #find the weight variable weight<-names(dataset[grep("PWT", names(dataset))])[1] dataset<-dataset[which(dataset[,weight]>=0),] #subest to 21-30s if young=TRUE if(young==TRUE){dataset<-dataset[which(dataset$AgeGroup=="21-30"),]} dataset$hhld_id<-substring(dataset$CASENO,1,13) #define graduates, post graduates and non_graduates dataset$Grad<-0 dataset$Grad[dataset$Graduate_Type=="Graduate"]<-1 dataset$PostGrad<-0 dataset$PostGrad[dataset$Graduate_Type=="PostGrad"]<-1 dataset$NonGrad<-0 dataset$NonGrad[dataset$Graduate_Type=="Non Grad"]<-1 #define employed graduates, post graduate and non graduates dataset$Emp_Grad<-0 dataset$Emp_Grad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="Graduate"]<-1 dataset$Emp_PostGrad<-0 dataset$Emp_PostGrad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="PostGrad"]<-1 dataset$Emp_NonGrad<-0 dataset$Emp_NonGrad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="Non Grad"]<-1 #define graduates, post graduates and non graduates in high skilled employment dataset$HSEmp_Grad<-0 dataset$HSEmp_Grad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="Graduate"& dataset$SOCHE=="High Skilled Employment"]<-1 dataset$HSEmp_PostGrad<-0 dataset$HSEmp_PostGrad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="PostGrad"& dataset$SOCHE=="High Skilled Employment"]<-1 dataset$HSEmp_NonGrad<-0 dataset$HSEmp_NonGrad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="Non Grad"& dataset$SOCHE=="High Skilled Employment"]<-1 #define Unemployed graduates, post graduate and non graduates dataset$Unemp_Grad<-0 dataset$Unemp_Grad[dataset$ILODEFR=="ILO unemployed" & dataset$Graduate_Type=="Graduate"]<-1 dataset$Unemp_PostGrad<-0 dataset$Unemp_PostGrad[dataset$ILODEFR=="ILO unemployed" & dataset$Graduate_Type=="PostGrad"]<-1 dataset$Unemp_NonGrad<-0 dataset$Unemp_NonGrad[dataset$ILODEFR=="ILO unemployed" & dataset$Graduate_Type=="Non Grad"]<-1 #define economically active graduates, post graduate and non graduates dataset$Act_Grad<-0 dataset$Act_Grad[(dataset$ILODEFR=="ILO unemployed"| dataset$ILODEFR=="In employment") & dataset$Graduate_Type=="Graduate"]<-1 dataset$Act_PostGrad<-0 dataset$Act_PostGrad[(dataset$ILODEFR=="ILO unemployed"| dataset$ILODEFR=="In employment") & dataset$Graduate_Type=="PostGrad"]<-1 dataset$Act_NonGrad<-0 dataset$Act_NonGrad[(dataset$ILODEFR=="ILO unemployed"| dataset$ILODEFR=="In employment") & dataset$Graduate_Type=="Non Grad"]<-1 #dataset<<-dataset #Define the survey design: #clusters- households #strata- low level area identifier (Unitary Local Authoritary) #weight- person weight weight_formula<-as.formula(paste0("~",weight)) surveydesign<-svydesign(id=~hhld_id, data=dataset, strata=~UALA, weight=weight_formula,nest=TRUE) #surveydesign<<-surveydesign #work out the standard error for graduates #Note the programe saves the variance = SE^2 ratio_and_confidence_int<-function(Numerator_input, Demoninator_input){ Numerator<-as.formula(paste0("~",Numerator_input)) Denominator<-as.formula(paste0("~",Demoninator_input)) #calcualte mean and standard error ratio_output<-svyratio(numerator=Numerator,denominator=Denominator, design = surveydesign) #calcualte confidence intervals confidence_int<-confint(ratio_output, level = 0.95,df =degf(surveydesign)) #reformat to dataframe ratio_output<-data.frame(matrix(unlist(ratio_output),nrow=1, byrow=T),stringsAsFactors = FALSE) confidence_int<-data.frame(matrix(unlist(confidence_int),nrow=1, byrow=T),stringsAsFactors = FALSE) #We sqare root to get back to the standard error ratio_output$X2<-sqrt(as.numeric(as.character(ratio_output$X2))) #summarise as 1 dataframe output<-data.frame(cbind(ratio_output$X1, confidence_int, ratio_output$X2),stringsAsFactors = FALSE) numerator_type<-strsplit(Numerator_input,"_")[[1]][1] names(output)<-c(paste0(numerator_type,"_prop"),paste0(numerator_type,"_prop_lower"), paste0(numerator_type,"_prop_upper"),paste0(numerator_type,"_SE")) output$population<-strsplit(Numerator_input,"_")[[1]][2] output$Quarter<-dataset_name return(output) } #graduates grad_emp<-ratio_and_confidence_int("Emp_Grad","Grad") grad_HSemp<-ratio_and_confidence_int("HSEmp_Grad","Grad") grad_Unemp<-ratio_and_confidence_int("Unemp_Grad","Act_Grad") list<-list(grad_emp, grad_HSemp, grad_Unemp) grad<-Reduce(function(x, y) merge(x, y, all.x=T, by=c("population","Quarter")), list, accumulate=F) #postgraduates PostGrad_emp<-ratio_and_confidence_int("Emp_PostGrad","PostGrad") PostGrad_HSemp<-ratio_and_confidence_int("HSEmp_PostGrad","PostGrad") PostGrad_Unemp<-ratio_and_confidence_int("Unemp_PostGrad","Act_PostGrad") list<-list(PostGrad_emp, PostGrad_HSemp, PostGrad_Unemp) PostGrad<-Reduce(function(x, y) merge(x, y, all.x=T, by=c("population","Quarter")), list, accumulate=F) #Non graduates NonGrad_emp<-ratio_and_confidence_int("Emp_NonGrad","NonGrad") NonGrad_HSemp<-ratio_and_confidence_int("HSEmp_NonGrad","NonGrad") NonGrad_Unemp<-ratio_and_confidence_int("Unemp_NonGrad","Act_NonGrad") list<-list(NonGrad_emp, NonGrad_HSemp, NonGrad_Unemp) NonGrad<-Reduce(function(x, y) merge(x, y, all.x=T, by=c("population","Quarter")), list, accumulate=F) ratios<-rbind(grad,PostGrad,NonGrad) #rename to keep in sync with Graduate_Types ratios$population[ratios$population=="Grad"]<-"Graduate" ratios$population[ratios$population=="NonGrad"]<-"Non Grad" #Also want the sample and population sizes dataset %>% group_by(ILODEFR,Graduate_Type)%>% summarise(count=n(),weighted_pop=sum(!!sym(weight))) %>% gather(variable, value, -(Graduate_Type:ILODEFR)) %>% unite(temp, ILODEFR, variable) %>% spread(temp, value)->samples dataset[which(dataset$SOCHE=="High Skilled Employment" & dataset$ILODEFR=="In employment"),] %>% group_by(Graduate_Type)%>% summarise(HS_emp_count=n(),HS_emp_weighted_pop=sum(!!sym(weight))) ->sample2 samples<-merge(x=samples, y=sample2, by=("Graduate_Type")) colnames(samples)[colnames(samples)=="Graduate_Type"]<-"population" ratios<-merge(x=ratios, y=samples, by=c("population"), all.x=TRUE) return(ratios) } #Find confidence intervals from start year to end year output<-lapply(start_year:end_year, function(y){lapply(1:4,function(x)confidence_intervals(x,y,FALSE))}) #store as dataframe output2<-Reduce(rbind,lapply(1:length(output),function(x)Reduce(rbind, output[[x]]))) output2$year<-substring(output2$Quarter,4,7) output2$Quarter<-substring(output2$Quarter,2,2) output2<-output2[c("population","year",names(output2)[2:22])] #sort by population and quarter output3<-output2[with(output2, order(population, year, Quarter)), ] output3$age<-"16-64" #repeat for young_graduates #Find confidence intervals from start year to end year output<-lapply(start_year:end_year, function(y){lapply(1:4,function(x)confidence_intervals(x,y,TRUE))}) #store as dataframe output2<-Reduce(rbind,lapply(1:length(output),function(x)Reduce(rbind, output[[x]]))) output2$year<-substring(output2$Quarter,4,7) output2$Quarter<-substring(output2$Quarter,2,2) output2<-output2[c("population","year",names(output2)[2:22])] #sort by population and quarter output4<-output2[with(output2, order(population, year, Quarter)), ] output4$age<-"21-30" output5<-rbind(output3,output4) write.csv(output5,paste0("Outputs for GLMS/Emp_confidence_Intervals.csv"))