---
title: "Untitled"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
# ggplot2 supplies ggplot(), geom_bar(), facet_wrap() and theme_bw() used in
# the plotting chunks further down; without it the document fails to knit.
library(ggplot2)
```

## R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

```{r cars}
summary(cars)
```

## Including Plots

You can also embed plots, for example:

```{r load-train, echo=FALSE}
# Chunk labels must be unique within a document, so the two loading chunks
# carry distinct labels.
titanic.train <- read.csv("data/train.csv", stringsAsFactors = FALSE)
```

```{r load-test, echo=FALSE}
titanic.test <- read.csv("data/test.csv", stringsAsFactors = FALSE)
```

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.

```{r}
str(titanic.train)
```

```{r}
str(titanic.test)
```

```{r}
summary(titanic.test)
```

```{r}
summary(titanic.train)
```

```{r}
# Number of passengers with an empty Cabin string (training set).
sum(titanic.train$Cabin == "", na.rm = TRUE)
```

```{r}
# Number of passengers with an empty Cabin string (test set).
sum(titanic.test$Cabin == "", na.rm = TRUE)
```

```{r}
# Row indices of 1st-class passengers whose Cabin is empty (training set).
train.class1.no.cabin <- which(
  titanic.train$Pclass == 1 & titanic.train$Cabin == ""
)
length(train.class1.no.cabin)
```

```{r}
# Row indices of 1st-class passengers whose Cabin is empty (test set).
test.class1.no.cabin <- which(
  titanic.test$Pclass == 1 & titanic.test$Cabin == ""
)
length(test.class1.no.cabin)
```

```{r}
# Only the empty cabins of 1st-class passengers are recoded as missing;
# empty cabins in the other classes are left as "".
titanic.train$Cabin[train.class1.no.cabin] <- NA
titanic.test$Cabin[test.class1.no.cabin] <- NA
```

```{r}
sum(is.na(titanic.train$Cabin))
```

```{r}
sum(is.na(titanic.test$Cabin))
```

```{r}
# Count empty strings per text column. vapply() iterates over the columns
# directly instead of coercing the data frame to a matrix as apply() does.
text.cols <- c("Name", "Sex", "Ticket", "Embarked")
vapply(
  titanic.train[text.cols],
  function(x) sum(x == "", na.rm = TRUE),
  integer(1)
)
```

```{r}
vapply(
  titanic.test[text.cols],
  function(x) sum(x == "", na.rm = TRUE),
  integer(1)
)
```

```{r}
# Recode empty Embarked values as missing; %in% never yields NA in the index.
titanic.train$Embarked[titanic.train$Embarked %in% ""] <- NA
```

```{r}
unique(titanic.train$Embarked)
```

```{r}
unique(titanic.test$Embarked)
```

```{r}
# Impute missing ports of embarkation with "S".
# NOTE(review): "S" is hard-coded, presumably as the most frequent port;
# check against the table printed in the next chunk.
titanic.train$Embarked[is.na(titanic.train$Embarked)] <- "S"
```

```{r}
xtabs(~Embarked, data = titanic.train)
```

```{r}
titanic.train$Embarked <- factor(titanic.train$Embarked)
titanic.test$Embarked <- factor(titanic.test$Embarked)
```

```{r}
# Shapiro-Wilk normality test on the test-set fares.
shapiro.test(titanic.test$Fare)
```

```{r}
# Passenger class of every row whose Fare is missing.
missing.fare <- is.na(titanic.test$Fare)
missing.fare.pclass <- titanic.test$Pclass[missing.fare]
```

```{r}
# Median fare per passenger class, then one value per missing row looked up by
# that row's own class. This stays correct when several fares are missing in
# different classes (a plain `Pclass == missing.fare.pclass` would recycle).
class.median.fare <- tapply(
  titanic.test$Fare,
  titanic.test$Pclass,
  median,
  na.rm = TRUE
)
median.fare <- unname(class.median.fare[as.character(missing.fare.pclass)])
```

```{r}
titanic.test$Fare[missing.fare] <- median.fare
```

```{r}
summary(titanic.test$Fare)
```

```{r}
titanic.train$Sex <- factor(titanic.train$Sex)
```

```{r}
summary(titanic.train$Sex)
```

```{r}
prop.table(summary(titanic.train$Sex))
```

```{r}
# Contingency table of Sex by Survived.
sex.survived.counts <- xtabs(~ Sex + Survived, data = titanic.train)
sex.survived.counts
```

```{r}
# Row-wise proportions: survival shares within each sex.
sex.surv.tbl <- prop.table(sex.survived.counts, margin = 1)
sex.surv.tbl
```

```{r}
titanic.train$Survived <- factor(
  titanic.train$Survived,
  levels = c(0, 1),
  labels = c("No", "Yes")
)
titanic.train$Pclass <- factor(
  titanic.train$Pclass,
  levels = c(1, 2, 3),
  labels = c("1st", "2nd", "3rd")
)
```

```{r}
# Survival counts per passenger class.
gp1 <- ggplot(titanic.train, aes(x = Pclass, fill = Survived)) +
  geom_bar(position = "dodge", width = 0.4) +
  ylab("Number of passengers") +
  xlab("Passenger class") +
  theme_bw()
gp1
```

```{r}
# Same plot, split into one panel per sex.
gp2 <- gp1 + facet_wrap(~Sex)
gp2
```

```{r}
# Survival counts per port of embarkation.
gp3 <- ggplot(titanic.train, aes(x = Embarked, fill = Survived)) +
  geom_bar(position = "dodge", width = 0.45) +
  ylab("Number of passengers") +
  xlab("Place of embarkment") +
  theme_bw()
gp3
```

```{r}
# add the Survived variable to the test set; it is all NA, and uses the same
# 0/1 coding and labels as the training set so the factor levels line up
titanic.test$Survived <- factor(NA, levels = c(0, 1), labels = c("No", "Yes"))

# transform the Pclass variable into factor (in the test set)
titanic.test$Pclass <- factor(
  x = titanic.test$Pclass,
  levels = c(1, 2, 3),
  labels = c("1st", "2nd", "3rd")
)

# transform the Sex variable into factor (in the test set)
titanic.test$Sex <- factor(titanic.test$Sex)

# the Embarked variable of the test set was already transformed into a factor
# in an earlier chunk, so it is not converted again here
```

```{r}
# merge train and test sets; rbind() on data frames matches columns by name,
# so the different position of Survived in the two sets does not matter
titanic.all <- rbind(titanic.train, titanic.test)
```