---
title: "Untitled"
output: html_document
---
```{r setup, include=FALSE}
# Global chunk option: show the source of every chunk unless a chunk overrides it.
knitr::opts_chunk$set(echo = TRUE)
# The plotting chunks call ggplot(), geom_bar(), facet_wrap() and theme_bw()
# unqualified, so ggplot2 has to be attached before they run.
library(ggplot2)
```
## R Markdown
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
```{r cars}
# Per-column summary statistics of the built-in `cars` data set.
summary(object = cars)
```
## Including Plots
You can also embed plots, for example:
```{r load-train, echo=FALSE}
# Read the training set. Text columns stay character vectors so that empty
# strings can be inspected before any conversion to factor.
titanic.train <- read.csv("data/train.csv", stringsAsFactors = FALSE)
```
```{r load-test, echo=FALSE}
# Read the test set with the same settings. Chunk labels must be unique:
# knitr stops with a duplicate-label error otherwise.
titanic.test <- read.csv("data/test.csv", stringsAsFactors = FALSE)
```
Note that the `echo = FALSE` parameter was added to the code chunks above to prevent printing of the R code that loads the data.
```{r}
# Structure (column types and leading values) of the training set.
str(object = titanic.train)
```
```{r}
# Structure of the test set.
str(object = titanic.test)
```
```{r}
# Per-column summary of the test set.
summary(object = titanic.test)
```
```{r}
# Per-column summary of the training set.
summary(object = titanic.train)
```
```{r}
# Number of training rows whose Cabin is the empty string.
sum(titanic.train$Cabin == "", na.rm = TRUE)
```
```{r}
# Number of test rows whose Cabin is the empty string.
sum(titanic.test$Cabin == "", na.rm = TRUE)
```
```{r}
# Row indices of first-class training passengers with an empty Cabin.
train.class1.no.cabin <- which(
  titanic.train$Cabin == "" & titanic.train$Pclass == 1
)
length(train.class1.no.cabin)
```
```{r}
# Same selection for the test set.
test.class1.no.cabin <- which(
  titanic.test$Cabin == "" & titanic.test$Pclass == 1
)
length(test.class1.no.cabin)
```
```{r}
# Mark those empty cabins as missing (NA) in both sets.
is.na(titanic.train$Cabin) <- train.class1.no.cabin
is.na(titanic.test$Cabin) <- test.class1.no.cabin
```
```{r}
# Number of training rows whose Cabin is now NA.
sum(is.na(titanic.train$Cabin))
```
```{r}
# Number of test rows whose Cabin is now NA.
sum(is.na(titanic.test$Cabin))
```
```{r}
# Count empty strings in each text column of the training set.
# vapply() walks the columns directly instead of coercing the data frame to a
# matrix (as apply() does) and guarantees one integer per column.
vapply(
  X = titanic.train[c("Name", "Sex", "Ticket", "Embarked")],
  FUN = function(column) sum(column == "", na.rm = TRUE),
  FUN.VALUE = integer(1)
)
```
```{r}
# Same count for the test set.
vapply(
  X = titanic.test[c("Name", "Sex", "Ticket", "Embarked")],
  FUN = function(column) sum(column == "", na.rm = TRUE),
  FUN.VALUE = integer(1)
)
```
```{r}
# Turn empty Embarked values in the training set into NA.
titanic.train$Embarked <- replace(
  titanic.train$Embarked,
  titanic.train$Embarked == "",
  NA
)
```
```{r}
# Distinct Embarked values in the training set (NA included).
unique(x = titanic.train$Embarked)
```
```{r}
# Distinct Embarked values in the test set.
unique(x = titanic.test$Embarked)
```
```{r}
# Fill the missing training values with "S".
# NOTE(review): presumably "S" is chosen as the most frequent port - confirm.
titanic.train$Embarked <- replace(
  titanic.train$Embarked,
  is.na(titanic.train$Embarked),
  "S"
)
```
```{r}
# Frequency of each Embarked value after imputation.
xtabs(~ Embarked, data = titanic.train)
```
```{r}
# Convert Embarked from character to factor in both data sets.
titanic.train$Embarked <- factor(x = titanic.train$Embarked)
titanic.test$Embarked  <- factor(x = titanic.test$Embarked)
```
```{r}
# Shapiro-Wilk normality test on the test-set fares.
shapiro.test(titanic.test$Fare)
```
```{r}
# Passenger class of every test row whose Fare is missing.
missing.fare.pclass <- titanic.test$Pclass[is.na(titanic.test$Fare)]
```
```{r}
# Median fare per passenger class, then one value per missing row in row order.
# Comparing Pclass directly against missing.fare.pclass is only correct when
# exactly one fare is missing; with several it recycles silently.
class.median.fare <- tapply(
  X = titanic.test$Fare,
  INDEX = titanic.test$Pclass,
  FUN = median,
  na.rm = TRUE
)
# as.vector() drops the names and dim left over from the tapply() result.
median.fare <- as.vector(class.median.fare[as.character(missing.fare.pclass)])
```
```{r}
# Replace each missing fare with the median fare of its own class.
titanic.test$Fare[is.na(titanic.test$Fare)] <- median.fare
```
```{r}
# Check the fare distribution after imputation.
summary(titanic.test$Fare)
```
```{r}
# Convert Sex to a factor in the training set.
titanic.train$Sex <- factor(x = titanic.train$Sex)
```
```{r}
# Passenger count per sex.
summary(object = titanic.train$Sex)
```
```{r}
# Share of each sex among training passengers.
prop.table(x = summary(object = titanic.train$Sex))
```
```{r}
# Contingency table: sex (rows) by survival (columns).
sex.survived.counts <- xtabs(~ Sex + Survived, data = titanic.train)
sex.survived.counts
```
```{r}
# Row-wise proportions: survival rate within each sex.
sex.surv.tbl <- prop.table(sex.survived.counts, margin = 1)
sex.surv.tbl
```
```{r}
# Recode Survived (0/1) as a labelled factor.
titanic.train$Survived <- factor(
  x = titanic.train$Survived,
  levels = c(0, 1),
  labels = c("No", "Yes")
)
# Recode Pclass (1/2/3) as a labelled factor.
titanic.train$Pclass <- factor(
  x = titanic.train$Pclass,
  levels = c(1, 2, 3),
  labels = c("1st", "2nd", "3rd")
)
```
```{r}
# Survival counts per passenger class, bars side by side.
gp1 <- ggplot(data = titanic.train, mapping = aes(x = Pclass, fill = Survived)) +
  geom_bar(position = "dodge", width = 0.4) +
  labs(x = "Passenger class", y = "Number of passengers") +
  theme_bw()
gp1
```
```{r}
# Same chart, one panel per sex.
gp2 <- gp1 + facet_wrap(~ Sex)
gp2
```
```{r}
# Survival counts per port of embarkation.
gp3 <- ggplot(data = titanic.train, mapping = aes(x = Embarked, fill = Survived)) +
  geom_bar(position = "dodge", width = 0.45) +
  labs(x = "Place of embarkment", y = "Number of passengers") +
  theme_bw()
gp3
```
```{r}
# Add the Survived variable to the test set: every value is NA. The levels and
# labels use the same 0/1 -> No/Yes coding as the training set.
titanic.test$Survived <- factor(NA, levels = c(0, 1), labels = c("No", "Yes"))
# Transform the Pclass variable into a factor (in the test set).
titanic.test$Pclass <- factor(
  x = titanic.test$Pclass,
  levels = c(1, 2, 3),
  labels = c("1st", "2nd", "3rd")
)
# Transform the Sex variable into a factor (in the test set).
titanic.test$Sex <- factor(titanic.test$Sex)
# Transform the Embarked variable into a factor (in the test set).
titanic.test$Embarked <- factor(titanic.test$Embarked)
```
```{r}
# Stack the training rows on top of the test rows in a single data frame.
titanic.all <- do.call(rbind, list(titanic.train, titanic.test))
```