test prep.R


# --------------------------
# Test Preparation Solutions
# (with feedback comments)
# R script
# --------------------------

# ----------
# Question 1
# ----------

# (code provided)
install.packages("ISwR")
library(tidyverse)
library(ISwR)
thuesen = as_tibble(thuesen)
model = lm(short.velocity~blood.glucose, data=thuesen)
summary(model)

# (a) Look at the output
#
# Call:
#   lm(formula = short.velocity ~ blood.glucose, data = thuesen)
#
# Residuals:
#   Min       1Q   Median       3Q      Max
# -0.40141 -0.14760 -0.02202  0.03001  0.43490
#
# Coefficients:
#   Estimate Std. Error t value Pr(>|t|)
# (Intercept)    1.09781    0.11748   9.345 6.26e-09 ***
#   blood.glucose  0.02196    0.01045   2.101   0.0479 *
#   ---
#   Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 0.2167 on 21 degrees of freedom
# (1 observation deleted due to missingness)
# Multiple R-squared:  0.1737,	Adjusted R-squared:  0.1343
# F-statistic: 4.414 on 1 and 21 DF,  p-value: 0.0479
#

# * By looking at the Coefficients section of the output
# above, the line of best fit is:
#   short.velocity = 1.09781 + 0.02196 * blood.glucose

# * The p-value corresponding to the blood.glucose
# coefficient, i.e., Pr(>|t|) in the Coefficients
# section of the output above, is 0.0479 < 0.05
# so we say that "blood.glucose is a significant
# predictor of short.velocity", i.e., we would
# definitely keep it in the model.


# (b) Scatterplot code
ggplot(thuesen, aes(x=blood.glucose,y=short.velocity)) +
  geom_point() +
  geom_smooth(method="lm", se=FALSE)
# thuesen is the dataset
# aes() sets the x-axis to be blood.glucose
#        and the y-axis to be short.velocity
# geom_point() draws the scatterplot of dots
# geom_smooth() draws the line of best fit
#     (the linear model or "lm")


# (c) Mean of the residuals
#     ... is ALWAYS zero
#     ... OR modify the code provided
library(broom)
mean(pull(augment(model),.resid))
# with output:
# [1] -3.613222e-18
# ... which is scientific notation for
# 3.613222 with the decimal place shifted 18 places to the left
# i.e. 0.000000000000000003613222
# Note that summary(model) gives the median and
# quartiles of the residuals but not mean


# (d) Diagnostic plots (code provided)
library(ggfortify)
autoplot(model)
# Residuals vs Fitted values: ok, blue line approx horizontal
#                             no nonlinear curve or funneling
# Normal Q-Q: the upper right points are a concern as they do not
#             lie on the dashed line
# Scale-Location: ok
# Residuals vs Leverage: some outliers

# (e) 95% confidence interval for correlation coefficient
#     need to use cor.test() not just cor()
cor.test(thuesen$blood.glucose,thuesen$short.velocity)
# or
cor.test(pull(thuesen,blood.glucose),pull(thuesen,short.velocity))
# or
x = thuesen$blood.glucose
y = thuesen$short.velocity
cor(x,y)   # gives NA due to some missing values
cor.test(x,y)
# or
x = pull(thuesen,blood.glucose)
y = pull(thuesen,short.velocity)
cor(x,y)   # gives NA due to some missing values
cor.test(x,y)
# In each case cor.test() gives
# 95 percent confidence interval:
#  0.005496682 0.707429479
# This confidence interval DOES NOT include zero, so we can conclude
# that in the population blood.glucose and short.velocity are
# positively correlated
# (but very unsure whether weak/moderate/strong)


# ----------
# Question 2
# ----------

# (code provided)
library(tidyverse)
msleep

# (a) Barchart
ggplot(msleep,aes(x=vore)) +
  geom_bar()
# Herbivore (herbi) is the most frequent vore
# msleep is the dataset
# aes() sets the x-axis to be vore
#       and there is NO y-axis (this will be
#         the counts)
# geom_bar() builds a frequency table and
#       draws the bars
# you could do a horizontal barchart using
#   coord_flip()

# (b) Boxplot
ggplot(msleep,aes(y=sleep_cycle)) +
  geom_boxplot()
# Warning! Need some care with boxplot
# -- if there is only one then use y=... (there is only
#      one thing on the x-axis)
# -- if side-by-side boxplots then use
#      x=categorical, y=quantitative
# -- to switch to horizontal use coord_flip()

# The boxplot shows TWO outliers (dots)
# Calculate the upper fence: UQ+1.5*IQR
# -- DO NOT guesstimate from the plot
upper_fence = 0.5792 + 1.5*(0.5792-0.3333)
upper_fence
# [1] 0.94805
# or
sort(msleep$sleep_cycle)
# or
sort(pull(msleep,sleep_cycle))
# gives
# [1] 0.1166667 0.1166667 0.1333333 0.1500000 0.1500000 0.1666667 0.1833333
# [8] 0.1833333 0.1833333 0.2000000 0.2000000 0.2166667 0.2166667 0.2333333
# [15] 0.2833333 0.3333333 0.3333333 0.3500000 0.3833333 0.3833333 0.4166667
# [22] 0.4166667 0.5000000 0.5500000 0.6666667 0.6666667 0.7500000 0.7666667
# [29] 0.9000000 1.0000000 1.4166667 1.5000000
# so the third from highest value is 1.0 (which is also the top of
#   the whisker in the boxplot)
# Accept as answer either 1.0 or 0.94805


# (c) Pipe, i.e., using %>%
msleep %>%
  filter(sleep_total>=16,sleep_total<=18) %>%
  select(name) %>%
  arrange(name)

# (d) Pipe to build summary table
msleep %>%
  group_by(conservation) %>%
  summarise(count=n(),mean_bodywt=mean(bodywt))


# (e) Pipe including join
conservation_table = tribble(
  ~conservation, ~description,
  "cd", "Conservation Dependent",
  "domesticated", "Domesticated",
  "en", "Endangered",
  "lc", "Least Concern",
  "nt", "Near Threatened",
  "vu", "Vulnerable"
)
# both tables have a column/variable called conservation
# so we can use a join
msleep %>%
  group_by(conservation) %>%
  summarise(count=n(),mean_bodywt=mean(bodywt)) %>%
  filter(mean_bodywt==max(mean_bodywt)) %>%
  left_join(conservation_table) %>%
  select(description)
# with output ...
# # A tibble: 1 x 1
# description#
# <chr>
#   1 Vulnerable


# Question 3
# Answers are the R commands in each case

#     X~N(16,2^2), find P(X>20)
#     mean is 16 and standard deviation is 2
#     syntax: pnorm(x,mean,standard_deviation)
pnorm(70,75,12) - pnorm(60,75,12)
# [1] 0.02275013

# (b) Find x so that P(X<x)=0.2
qnorm(0.85,74,12)
# [1] 14.31676

# (c) 1kg=1000g
#     total weight of box T follows a normal distribution
#     with mean 66*16 (remember to add means)
#     and variance 66*2^2 (remember to add variances)
#     so standard deviation is sqrt(66*4)
pnorm(1000,66*16,sqrt(66*4))
# [1] 0.0002838844
# comment: very tiny chance of box having < 1kg

# -- the end --

library(tidyverse)
summary(diamonds)


diamonds %>%
  filter() %>%
  select(color) %>%

ggplot(diamonds,aes(x=carat)) +
  geom_histogram(binwidth=0.02)

diamonds %>%
  group_by(cut) %>%
  summarise(number_diamond=n(), cor(carat, price))

library(tidyverse)
library(ggplot2)
loading = c(3,8,10,11,13,16,27,30,35,37,38,44,103,142)
removal = c(4,7,8,8,10,16,26,21,9,31,30,75,90)
wetland = tibble(loading,removal)
ggplot(wetland, aes(x=loading,y=removal)) +
  geom_point() +
  geom_smooth(method='lm', se=FALSE)

	# --------------------------
	# Test Preparation Solutions
	# (with feedback comments)
	# R script
	# --------------------------

	# ----------
	# Question 1
	# ----------

	# (code provided)
	install.packages("ISwR")
	library(tidyverse)
	library(ISwR)
	thuesen = as_tibble(thuesen)
	model = lm(short.velocity~blood.glucose, data=thuesen)
	summary(model)

	# (a) Look at the output
	#
	# Call:
	# lm(formula = short.velocity ~ blood.glucose, data = thuesen)
	#
	# Residuals:
	# Min 1Q Median 3Q Max
	# -0.40141 -0.14760 -0.02202 0.03001 0.43490
	#
	# Coefficients:
	# Estimate Std. Error t value Pr(>\|t\|)
	# (Intercept) 1.09781 0.11748 9.345 6.26e-09 ***
	# blood.glucose 0.02196 0.01045 2.101 0.0479 *
	# ---
	# Signif. codes: 0 '*' 0.001 '' 0.01 '*' 0.05 '.' 0.1 ' ' 1
	#
	# Residual standard error: 0.2167 on 21 degrees of freedom
	# (1 observation deleted due to missingness)
	# Multiple R-squared: 0.1737, Adjusted R-squared: 0.1343
	# F-statistic: 4.414 on 1 and 21 DF, p-value: 0.0479
	#

	# * By looking at the Coefficients section of the output
	# above, the line of best fit is:
	# short.velocity = 1.09781 + 0.02196 * blood.glucose

	# * The p-value corresponding to the blood.glucose
	# coefficient, i.e., Pr(>\|t\|) in the Coefficients
	# section of the output above, is 0.0479 < 0.05
	# so we say that "blood.glucose is a significant
	# predictor of short.velocity", i.e., we would
	# definitely keep it in the model.


	# (b) Scatterplot code
	ggplot(thuesen, aes(x=blood.glucose,y=short.velocity)) +
	geom_point() +
	geom_smooth(method="lm", se=FALSE)
	# thuesen is the dataset
	# aes() sets the x-axis to be blood.glucose
	# and the y-axis to be short.velocity
	# geom_point() draws the scatterplot of dots
	# geom_smooth() draws the line of best fit
	# (the linear model or "lm")


	# (c) Mean of the residuals
	# ... is ALWAYS zero
	# ... OR modify the code provided
	library(broom)
	mean(pull(augment(model),.resid))
	# with output:
	# [1] -3.613222e-18
	# ... which is scientific notation for
	# 3.613222 with the decimal place shifted 18 places to the left
	# i.e. 0.000000000000000003613222
	# Note that summary(model) gives the median and
	# quartiles of the residuals but not mean


	# (d) Diagnostic plots (code provided)
	library(ggfortify)
	autoplot(model)
	# Residuals vs Fitted values: ok, blue line approx horizontal
	# no nonlinear curve or funneling
	# Normal Q-Q: the upper right points are a concern as they do not
	# lie on the dashed line
	# Scale-Location: ok
	# Residuals vs Leverage: some outliers

	# (e) 95% confidence interval for correlation coefficient
	# need to use cor.test() not just cor()
	cor.test(thuesen$blood.glucose,thuesen$short.velocity)
	# or
	cor.test(pull(thuesen,blood.glucose),pull(thuesen,short.velocity))
	# or
	x = thuesen$blood.glucose
	y = thuesen$short.velocity
	cor(x,y) # gives NA due to some missing values
	cor.test(x,y)
	# or
	x = pull(thuesen,blood.glucose)
	y = pull(thuesen,short.velocity)
	cor(x,y) # gives NA due to some missing values
	cor.test(x,y)
	# In each case cor.test() gives
	# 95 percent confidence interval:
	# 0.005496682 0.707429479
	# This confidence interval DOES NOT include zero, so we can conclude
	# that in the population blood.glucose and short.velocity are
	# positively correlated
	# (but very unsure whether weak/moderate/strong)


	# ----------
	# Question 2
	# ----------

	# (code provided)
	library(tidyverse)
	msleep

	# (a) Barchart
	ggplot(msleep,aes(x=vore)) +
	geom_bar()
	# Herbivore (herbi) is the most frequent vore
	# msleep is the dataset
	# aes() sets the x-axis to be vore
	# and there is NO y-axis (this will be
	# the counts)
	# geom_bar() builds a frequency table and
	# draws the bars
	# you could do a horizontal barchart using
	# coord_flip()

	# (b) Boxplot
	ggplot(msleep,aes(y=sleep_cycle)) +
	geom_boxplot()
	# Warning! Need some care with boxplot
	# -- if there is only one then use y=... (there is only
	# one thing on the x-axis)
	# -- if side-by-side boxplots then use
	# x=categorical, y=quantitative
	# -- to switch to horizontal use coord_flip()

	# The boxplot shows TWO outliers (dots)
	# Calculate the upper fence: UQ+1.5*IQR
	# -- DO NOT guesstimate from the plot
	upper_fence = 0.5792 + 1.5*(0.5792-0.3333)
	upper_fence
	# [1] 0.94805
	# or
	sort(msleep$sleep_cycle)
	# or
	sort(pull(msleep,sleep_cycle))
	# gives
	# [1] 0.1166667 0.1166667 0.1333333 0.1500000 0.1500000 0.1666667 0.1833333
	# [8] 0.1833333 0.1833333 0.2000000 0.2000000 0.2166667 0.2166667 0.2333333
	# [15] 0.2833333 0.3333333 0.3333333 0.3500000 0.3833333 0.3833333 0.4166667
	# [22] 0.4166667 0.5000000 0.5500000 0.6666667 0.6666667 0.7500000 0.7666667
	# [29] 0.9000000 1.0000000 1.4166667 1.5000000
	# so the third from highest value is 1.0 (which is also the top of
	# the whisker in the boxplot)
	# Accept as answer either 1.0 or 0.94805


	# (c) Pipe, i.e., using %>%
	msleep %>%
	filter(sleep_total>=16,sleep_total<=18) %>%
	select(name) %>%
	arrange(name)

	# (d) Pipe to build summary table
	msleep %>%
	group_by(conservation) %>%
	summarise(count=n(),mean_bodywt=mean(bodywt))


	# (e) Pipe including join
	conservation_table = tribble(
	~conservation, ~description,
	"cd", "Conservation Dependent",
	"domesticated", "Domesticated",
	"en", "Endangered",
	"lc", "Least Concern",
	"nt", "Near Threatened",
	"vu", "Vulnerable"
	)
	# both tables have a column/variable called conservation
	# so we can use a join
	msleep %>%
	group_by(conservation) %>%
	summarise(count=n(),mean_bodywt=mean(bodywt)) %>%
	filter(mean_bodywt==max(mean_bodywt)) %>%
	left_join(conservation_table) %>%
	select(description)
	# with output ...
	# # A tibble: 1 x 1
	# description#
	# <chr>
	# 1 Vulnerable



	# Question 3
	# Answers are the R commands in each case

	# X~N(16,2^2), find P(X>20)
	# mean is 16 and standard deviation is 2
	# syntax: pnorm(x,mean,standard_deviation)
	pnorm(70,75,12) - pnorm(60,75,12)
	# [1] 0.02275013

	# (b) Find x so that P(X<x)=0.2
	qnorm(0.85,74,12)
	# [1] 14.31676

	# (c) 1kg=1000g
	# total weight of box T follows a normal distribution
	# with mean 66*16 (remember to add means)
	# and variance 66*2^2 (remember to add variances)
	# so standard deviation is sqrt(66*4)
	pnorm(1000,6616,sqrt(664))
	# [1] 0.0002838844
	# comment: very tiny chance of box having < 1kg

	# -- the end --

	library(tidyverse)
	summary(diamonds)


	diamonds %>%
	filter() %>%
	select(color) %>%

	ggplot(diamonds,aes(x=carat)) +
	geom_histogram(binwidth=0.02)

	diamonds %>%
	group_by(cut) %>%
	summarise(number_diamond=n(), cor(carat, price))

	library(tidyverse)
	library(ggplot2)
	loading = c(3,8,10,11,13,16,27,30,35,37,38,44,103,142)
	removal = c(4,7,8,8,10,16,26,21,9,31,30,75,90)
	wetland = tibble(loading,removal)
	ggplot(wetland, aes(x=loading,y=removal)) +
	geom_point() +
	geom_smooth(method='lm', se=FALSE)