--- title: "Tourist2022EN" author: "Kai Wu" date: "2022/12/5" output: html_document --- ### tourist satisfaction survey dataset(simulated data,45 variables,373 cases) ###(1)questionnaire https://od.lk/d/178312136_WyH9N/tourist_satisfaction_questionnaire_en.docx https://od.lk/d/178312135_ZW7sa/tourist_satisfaction_questionnaire_en.pdf ###(2)DATA https://od.lk/d/178313264_Tr9Ti/data_tourist_satisfaction.csv https://od.lk/d/178313263_MVBwn/data_tourist_satisfaction_en.xlsx ###(3)Rmarkdown file(codes for data analysis) ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` # 1.data preparation alternative terms of data preparation: data wrangling、data munging、data cleaning https://theappsolutions.com/blog/development/data-wrangling-guide-to-data-preparation/ Data Wrangling in R (1)Dplyr - essential data-munging R package. Supreme data framing tool. Especially useful for data management operating by categories. (2)Purrr - good for list function operations and error-checking. (3)Splitstackshape - an oldie but goldie. Good for shaping complex data sets and simplifying the visualization. (4)JSOnline - nice and easy parsing tool. (5)Magrittr - good for wrangling scattered sets and putting them into a more coherent form. ## 1.1 load packages ```{r} if(!isTRUE(require("Hmisc"))){install.packages("Hmisc")}#statistical analysis if(!isTRUE(require("psych"))){install.packages("psych")}#statistical analysis if(!isTRUE(require("lavaan"))){install.packages("lavaan")} #The lavaan package is developed to provide useRs, researchers and teachers a free open-source, but commercial-quality package for latent variable modeling if(!isTRUE(require("broom"))){install.packages("broom")}# tidy report if(!isTRUE(require("ggplot2"))){install.packages("ggplot2")}# data visualization if(!isTRUE(require("ggrepel"))){install.packages("ggrepel")} # provides geoms for ggplot2 to repel overlapping text labels if(!isTRUE(require("corrplot"))){install.packages("corrplot")} #A visualization of a correlation matrix if(!isTRUE(require("semPlot"))){install.packages("semPlot")} #Description Path diagrams and visual analysis of various SEM packages' output if(!isTRUE(require("leaflet"))){install.packages("leaflet")}# maps based on openstreet data ``` install packages ```{r} #install.packages(Hmisc) #install.packages(psych) #install.packages(lavaan) #install.packages(broom) #install.packages(ggplot2) #install.packages(ggrepel) #install.packages(corrplot) #install.packages(semPlot) #install.packages(leaflet) ``` ## 1.2 import dataset datafolder is your working folder, you can modify this by yourself ```{r} #datafolder is your working folder, you can modify this by yourself datafolder<-'D:/pdata/rdata/tourist_en/' ``` ```{r} #tourist<-read.csv('https://od.lk/d/178313264_Tr9Ti/data_tourist_satisfaction.csv',header=TRUE,encoding = 'UTF-8') tourist<-read.csv(paste(datafolder,'data_tourist_satisfaction.csv',sep=""),header=TRUE,encoding = 'UTF-8') tail(tourist) ##### practice 1 : download the data and import the data into the working space ``` ## 1.3 value labels add value labels to categorical variables. ```{r} #value labels tourist$gender <- factor(tourist$gender,levels = c(1,2),labels = c("male", "female")) tourist$thotel <- factor(tourist$thotel,levels = c(1,2,3,4),labels = c("budget hotel", "luxury hotel", "bed and breakfast", "apartment hotel")) tourist$type3 <- factor(tourist$type3,levels = c(1,2,3),labels = c("natural scenery", "historical scenery", "mixed scenery")) tourist$type2 <- factor(tourist$type2,levels = c(1,2),labels = c("sightseeing", "participation")) head(tourist) ##### practice02 ##### add value labels to variable of region ##### 1 Central China ##### 2 East China ##### 3 North China ##### 4 Northeast China ##### 5 Northwest China ##### 6 Southwest China ##### 7 West China ``` ## 1.4 compute variables #### compute variables ```{r} #compute the overall of satisfaction tourist<-transform(tourist,sat=(sat1+sat2+sat3+sat4+sat5+sat6)/6) summary(tourist$sat) ``` ```{r} #compute variables # practice 03 : compute the age # age = 2022 - byear ``` ```{r} #compute variables # practice 03 : compute the age # age = 2022 - byear tourist<-transform(tourist,age = 2022 - byear) ``` ## 1.5 recode ```{r} # recode income3 (income range) attach(tourist) tourist$income3[income < 2000] <- 1 tourist$income3[income >= 2000 & income <3000] <- 2 tourist$income3[income >= 3000] <- 3 detach(tourist) # add value labels tourist$income3 <- factor(tourist$income3,levels = c(1,2,3),labels = c("below 2000", "2000-3000", "above 3000")) summary(tourist$income3) ``` ```{r} # recode tourism expense attach(tourist) tourist$expense3[expense < 300] <- 1 tourist$expense3[expense >= 300 & expense < 400] <- 2 tourist$expense3[expense > 400] <- 3 detach(tourist) # add value labels tourist$expense3 <- factor(tourist$expense3,levels = c(1,2,3),labels = c("below 300", "300-399", "above 400")) summary(tourist$expense3) ``` ```{r} # recode age range # practice 04 : recode age ``` ```{r} # recode age range attach(tourist) tourist$age4[age < 20] <- 1 tourist$age4[age >= 20 & age < 40] <- 2 tourist$age4[age >= 40 & age < 60] <- 3 tourist$age4[age >= 60] <- 4 detach(tourist) # add value labels tourist$age4 <- factor(tourist$age4,levels = c(1,2,3,4),labels = c("below 20", "20-39","40-59", "above 60")) summary(tourist$age4) ``` ## 1.6 summary() ### look through the dataset ```{r} summary(tourist) ``` ## 1.7 distribution of the data ```{r} # density chart plot(density(tourist$sat)) plot(density(tourist$sat),main="", xlab='overall satisfaction', ylab='frequenccy') # histgram chart hist(tourist$sat) hist(tourist$sat,main="", xlab='overall satisfaction data', ylab='frequency') ``` ```{r} # practice 05 : check the distribution of income ``` for loop: distribution of 31 variables ```{r} #for (i in 10:40){ # plot(density(tourist[,i])) # plot(density(tourist[,i]),main="", xlab=names(tourist[i])) #} ``` for loop: distribution of 31 variables histgram chart ```{r} #for (i in 10:40){ # hist(tourist[,i],main="", xlab=names(tourist[i])) # h<-hist(tourist[,i],plot=FALSE) # hist(tourist[,i],main="",xlab=names(tourist[i]),labels = TRUE,ylim=c(0,1.1*max(h$counts))) # https://stackoverflow.com/questions/9317948/how-to-label-histogram-bars-with-data-values-or-percents-in-r #} ``` ## 1.8 save the dataset ```{r} save(tourist,file=paste(datafolder,'tourist_EN.rda',sep="")) ``` # 2. basic analysis ## 2.1 frequency analysis ```{r} #frequency table of gender t1<-table(tourist$gender) t1 #percentage of gender prop.table(t1) round(prop.table(t1)*100,2) ``` for tidy report,we can transfer the table into data frame ```{r} #t1b is on frequency t1b<-as.data.frame(table(tourist$gender)) # t1c is on percentage t1c<-as.data.frame(round(prop.table(t1)*100,2)) #combine t1b and t1c t1d<-cbind(t1b,t1c$Freq) #rename the variable names colnames(t1d)<-c('gender','frequency','percent') print(t1d) #export the result as a csv file. write.csv(t1d,paste(datafolder,'tourist_gender.csv',sep="")) ``` ```{r} #frequency of hotel type t2<-table(tourist$thotel) t2 round(prop.table(t2)*100,2) ``` ```{r} ##### practice 06 frequency analysis on region ``` ```{r} hist(tourist$byear) # with labels(counts) h2<-hist(tourist$byear,plot=FALSE) hist(tourist$byear,main="", xlab='birth year') hist(tourist$byear,main="", xlab='birth year',labels = TRUE) hist(tourist$byear,main="", xlab='birth year',labels = TRUE,ylim=c(0,50)) hist(tourist$byear,main="", xlab='birth year',labels = TRUE,ylim=c(0,50),col='lightblue') hist(tourist$byear,main="", xlab='birth year',labels = TRUE,ylim=c(0,50),col='#ffb01f') #html color codes #https://html-color.codes/ # r colors names # http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf # https://www.datamentor.io/r-programming/color/ ``` ```{r} ##### practice 7 : ##### step1 : 1.4 compute age based on birth year ##### step2 : 1.7 check the distribution of age ##### step3 : 1.5 record age variable age6 ##### (below 20,20-29,30-39,40-49,50-59,above 60) ##### step4 : 2.1 conduct frequency analysis of age6 ``` ```{r} # solutions to practice 07 age<-2022-tourist$byear hist(age) attach(tourist) tourist$age6[age < 20] <- 1 tourist$age6[age >= 20 & age < 30] <- 2 tourist$age6[age >= 30 & age < 40] <- 3 tourist$age6[age >= 40 & age < 50] <- 4 tourist$age6[age >= 50 & age < 60] <- 5 tourist$age6[age >= 60] <- 6 detach(tourist) tourist$age6 <- factor(tourist$age6,levels = c(1,2,3,4,5,6),labels = c("below 20", "20-29","30-39","40-49", "50-59","above 60")) t3<-table(tourist$age6) t3 round(prop.table(t3)*100,2) ``` ## 2.2 crosstable and chi-square test ```{r} ### crosstable ct1<-table(tourist$thotel,tourist$gender) prop.table(ct1) prop.table(ct1,1) prop.table(ct1,2) round(prop.table(ct1,1)*100,2) # summary of chi-square test summary(ct1) ``` ```{r} # #transfer ct1 into data frame ct1b<-as.data.frame(ct1) # ct1c<-as.data.frame(round(prop.table(ct1,1)*100,2)) #combine ct1d<-cbind(ct1b,ct1c$Freq) #rename colnames(ct1d)<-c('hotel','gender','frequency','percent') print(ct1d) #export write.csv(ct1d,paste(datafolder,'tourist_hotel.csv',sep="")) ``` ```{r} ##### practice 08 : ##### is there significant relationship between income3 and thotel? ``` ## 2.3 independent T-test ```{r} t.test(tourist$sat1~tourist$gender) ``` ```{r} #tidy report ttest1<-t.test(tourist$sat1~tourist$gender) ttest12<-tidy(ttest1) ttest12 #export write.csv(ttest12,paste(datafolder,'sat1_gender.csv',sep="")) ``` ```{r} ##### practice 09 : ##### H3 female is more satisfied with scenery than male ``` ## 2.4 ANOVA ```{r} fit<-aov(sat1~income3,tourist) summary(fit) tidy(fit) ``` ## 2.5 correlation ```{r} #scatter chart plot(tourist$income,tourist$expense) ``` ```{r} cor.test(tourist$income,tourist$expense) cor(tourist$income,tourist$expense,method="spearman") ``` tidy report ```{r} cor_income_expense<-cor.test(tourist$income,tourist$expense) cor_income_expense<-tidy(cor_income_expense) cor_income_expense #export write.csv(cor_income_expense,paste(datafolder,'cor_income_expense.csv',sep="")) ``` correlation matrix ```{r} cor_data<-tourist[,c(5,6,10:15)] mydata.cor = cor(cor_data) mydata.cor ``` ```{r} #rcorr is from the Hmisc package mydata.rcorr <- rcorr(as.matrix(cor_data)) mydata.coeff <- mydata.rcorr$r mydata.coeff mydata.p <- mydata.rcorr$P mydata.p ``` ```{r} tidy(mydata.rcorr) #export write.csv(tidy(mydata.rcorr),paste(datafolder,'correlation_matrix.csv',sep="")) ``` chart of correlation matrix ```{r} corrplot(mydata.cor) ``` ## 3.1 ggplot2 ###plot ```{r} qplot(tourist$income,tourist$expense) ``` ```{r} ggplot(tourist,aes(x=income,y=expense))+ geom_point(size=1) ``` ```{r} ggplot(tourist,aes(x=income,y=expense))+ geom_point(size=2,aes(colour=gender)) ``` ```{r} ggplot(tourist,aes(x=income,y=expense))+ geom_point(size=2,aes(colour=thotel)) ``` ```{r} ggplot(tourist,aes(x=income,y=expense))+ geom_point(size=2)+ facet_grid(type3 ~.) ``` ```{r} ggplot(tourist,aes(x=income,y=expense))+ geom_point(size=2,aes(colour=type3))+ facet_grid(type3 ~.) ``` ```{r} ggplot(tourist,aes(x=income,y=expense))+ geom_point(size=2,aes(colour=type3))+ facet_grid(type3 ~type2) ``` ```{r} ggplot(tourist,aes(x=sat2,fill=gender))+ geom_histogram(binwidth=1,colour="white")+ facet_grid(gender ~.) ``` ```{r} ggplot(tourist,aes(x=sat2,fill=type3))+ geom_histogram(binwidth=1,colour="white")+ facet_grid(type2 ~type3) ``` # 4 map leaflet Leaflet is the leading open-source JavaScript library for mobile-friendly interactive maps. Weighing just about 42 KB of JS, it has all the mapping features most developers ever need. https://leafletjs.com/ openstreet map https://www.openstreetmap.org/ ```{r} center_lon = median(tourist$longitude,na.rm = TRUE) center_lat = median(tourist$latitude,na.rm = TRUE) leaflet(tourist) %>% addProviderTiles("Esri.NatGeoWorldMap") %>% addCircles(lng = ~longitude, lat = ~latitude,radius = ~sqrt(income/100)) %>% # center,zoom level setView(lng=center_lon, lat=center_lat,zoom = 10) ``` # 5 IPA (importance-performance analysis) Liu, X., & Zhang, N. (2020). Research on Customer Satisfaction of Budget Hotels Based on Revised IPA and Online Reviews. Science Journal of Business and Management, 8(2), 50–56. https://doi.org/10.11648/j.sjbm.20200802.11 https://www.sciencepublishinggroup.com/journal/paperinfo?journalid=175&doi=10.11648/j.sjbm.20200802.11 ## 5.1 data wrangling ```{r} review_label<-as.data.frame(c('amount', 'publishing date', 'relevance', 'positive', 'credibility')) colnames(review_label)<-c("label") tourist1<-tourist[,c("ri1","ri2","ri3","ri4","ri5","rp1","rp2","rp3","rp4","rp5")] hdata<-colMeans(tourist1) im<-tourist[,c("ri1","ri2","ri3","ri4","ri5")] pe<-tourist[,c("rp1","rp2","rp3","rp4","rp5")] imdata<-as.data.frame(colMeans(im)) colnames(imdata)<-c("importance") pedata<-as.data.frame(colMeans(pe)) colnames(pedata)<-c("performance") ipa<-cbind(imdata,pedata,review_label) ``` #5.2 compute means for IPA ```{r} print("------------importance----------") print(paste("min :",round(min(ipa[1]),2))) print(paste("max :",round(max(ipa[1]),2))) print("------------performance----------") print(paste("min :",round(min(ipa[2]),2))) print(paste("max :",round(max(ipa[2]),2))) print(colMeans(ipa[,1:2])) ``` ## 5.3 IPA chart ```{r} #2 basic of ipa_chart<-ggplot(ipa,aes(x=performance,y=importance,))+ geom_point(size=2)+ xlim(4.2, 4.8)+ylim(4.2, 4.8)+ labs(x = "performance",y="importance")+ geom_vline(xintercept=4.516,color = "red",linetype="dashed")+ geom_hline(yintercept=4.555,color = "red",linetype="dashed")+ geom_abline(intercept = 0, slope = 1, color="blue",size=0.5)+ geom_text_repel(aes(label=label),size=4,hjust=0.3,vjust=0.3) ipa_chart ggsave(paste(datafolder,'ipa_chart.png',sep=""),ipa_chart, width = 7) ``` # 6.EFA(explortary factor analysis) Chapman, C., & Feit, E. M. (2019). R For Marketing Research and Analytics (2nd ). Springer. https://link.springer.com/book/10.1007%2F978-3-030-14316-9 https://link.springer.com/book/10.1007/978-3-030-14316-9 ```{r} #load(url('https://od.lk/d/170336656_2dwPO/tourist_EN.rda')) load(paste(datafolder,'tourist_EN.rda',sep="")) ``` ## 6.1 datasets ```{r} satdata<-tourist[,c(10:15)] idata<-tourist[,c(16:20)] pdata<-tourist[,c(21:25)] tmdata<-tourist[,c(26:33)] zhdata<-tourist[,c(34:40)] #zhdata<-tourist[,c('zh1','zh2','zh3','zh4','zh5','zh6','zh7')] head(satdata) head(idata) head(pdata) head(tmdata) head(zhdata) ``` ## 6.2 kmo test https://www.statisticshowto.com/kaiser-meyer-olkin/ KMO(Kaiser-Meyer-Olkin) The Kaiser-Meyer-Olkin (KMO) Test is a measure of how suited your data is for Factor Analysis. The test measures sampling adequacy for each variable in the model and for the complete model. KMO returns values between 0 and 1. KMO values between 0.8 and 1 indicate the sampling is adequate. ```{r} kmo_test<-KMO(zhdata) print(kmo_test, stats = c("both", "MSA", "KMO"), vars = "all", sort = FALSE, show = "all", digits = 3) ``` ## 6.3 Bartlett’s test https://www.statology.org/bartletts-test-of-sphericity/ Bartlett’s test of sphericity Bartlett’s Test of Sphericity compares an observed correlation matrix to the identity matrix. Essentially it checks to see if there is a certain redundancy between the variables that we can summarize with a few number of factors. If the p-value from Bartlett’s Test of Sphericity is lower than our chosen significance level (common choices are 0.10, 0.05, and 0.01), then our dataset is suitable for a data reduction technique. ```{r} cortest.bartlett(zhdata) ``` ## 6.4 estimate the number of factors Chapman, C., & Feit, E. M. (2019). R For Marketing Research and Analytics. Springer. chapter 8.p206-213 https://link.springer.com/book/10.1007%2F978-3-030-14316-9 ```{r} parallel <- fa.parallel(zhdata, fm = 'minres', fa = 'both',n.iter=100) ``` ## 6.5 attrive 2 factors ```{r} efa2 <- fa(zhdata,nfactors = 2,rotate = "oblimin",fm="minres") print(efa2) print(efa2$loadings,cutoff = 0.5) ``` ## 6.6 chart ```{r} factor.plot(efa2,labels=rownames(efa2$loadings)) ``` ```{r} #heatmap.2(efa2$loadings, # col=brewer.pal(9, "Greens"), trace="none", key=FALSE, dend="none", # Colv=FALSE, cexCol = 1.2) ``` ```{r} fa.diagram(efa2) #fa.diagram(efa4plog,simple=FALSE) ``` # 7.CFA, Confirmatory Factor Analysis MR1 MR2 zh1 0.821 zh2 0.911 zh3 0.875 zh4 0.733 zh5 0.737 zh6 0.868 zh7 0.910 ## 7.1 model setting ```{r} cfa_model <- " D1 =~ zh1 + zh2 + zh3 + zh4 D2 =~ zh5 + zh6 + zh7 " ``` ## 7.2 model estimation ```{r} cfa.fit <- cfa(cfa_model, data=zhdata) summary(cfa.fit, fit.measures=TRUE) ``` ## 7.3 chart ```{r} semPaths(cfa.fit, what="est", fade=FALSE, residuals=FALSE,edge.label.cex=0.75) ``` # 8.reliability analysis ```{r} zh_d1<-tourist[,c('zh1','zh2','zh3','zh4')] alpha(zh_d1) ``` ```{r} zh_d2<-tourist[,c('zh5','zh6','zh7')] alpha(zh_d2) ```