# --------------------------
# Test Preparation Solutions
# (with feedback comments)
# R script
# --------------------------
# ----------
# Question 1
# ----------
# (code provided)
# Convert thuesen to a tibble, then fit the simple linear regression of
# short.velocity on blood.glucose.
# NOTE(review): assumes as_tibble() and the thuesen data are made available
# by the provided setup code -- confirm.
thuesen <- as_tibble(thuesen)
model <- lm(short.velocity ~ blood.glucose, data = thuesen)
# (a) Look at the output
# Call:
# lm(formula = short.velocity ~ blood.glucose, data = thuesen)
# Residuals:
# Min 1Q Median 3Q Max
# -0.40141 -0.14760 -0.02202 0.03001 0.43490
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1.09781 0.11748 9.345 6.26e-09 ***
# blood.glucose 0.02196 0.01045 2.101 0.0479 *
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual standard error: 0.2167 on 21 degrees of freedom
# (1 observation deleted due to missingness)
# Multiple R-squared: 0.1737, Adjusted R-squared: 0.1343
# F-statistic: 4.414 on 1 and 21 DF, p-value: 0.0479
# * By looking at the Coefficients section of the output
# above, the line of best fit is:
# short.velocity = 1.09781 + 0.02196 * blood.glucose
# * The p-value corresponding to the blood.glucose
# coefficient, i.e., Pr(>|t|) in the Coefficients
# section of the output above, is 0.0479 < 0.05
# so we say that "blood.glucose is a significant
# predictor of short.velocity", i.e., we would
# definitely keep it in the model.
# (b) Scatterplot code
# Scatterplot of short.velocity against blood.glucose with the fitted
# least-squares line drawn on top (no confidence band).
ggplot(data = thuesen, mapping = aes(x = blood.glucose, y = short.velocity)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
# thuesen is the dataset
# aes() sets the x-axis to be blood.glucose
# and the y-axis to be short.velocity
# geom_point() draws the scatterplot of dots
# geom_smooth() draws the line of best fit
# (the linear model or "lm")
# (c) Mean of the residuals
# ... is ALWAYS zero
# ... OR modify the code provided, e.g. mean(residuals(model)),
# with output:
# [1] -3.613222e-18
# ... which is scientific notation for
# -3.613222 with the decimal place shifted 18 places to the left
# i.e. -0.000000000000000003613222
# (zero, up to floating-point rounding error)
# Note that summary(model) gives the median and
# quartiles of the residuals but not mean
# (d) Diagnostic plots (code provided)
# Residuals vs Fitted values: ok, blue line approx horizontal
# no nonlinear curve or funneling
# Normal Q-Q: the upper right points are a concern as they do not
# lie on the dashed line
# Scale-Location: ok
# Residuals vs Leverage: some outliers
# (e) 95% confidence interval for correlation coefficient
# need to use cor.test() not just cor(): cor() only returns the point
# estimate, whereas cor.test() also reports the 95% confidence interval
cor.test(thuesen$blood.glucose, thuesen$short.velocity)
# or
x <- thuesen$blood.glucose
y <- thuesen$short.velocity
cor(x, y) # gives NA due to some missing values
cor.test(x, y) # incomplete pairs are dropped before testing
# or
x <- pull(thuesen, blood.glucose)
y <- pull(thuesen, short.velocity)
cor(x, y) # gives NA due to some missing values
cor.test(x, y) # incomplete pairs are dropped before testing
# In each case cor.test() gives
# 95 percent confidence interval:
# 0.005496682 0.707429479
# This confidence interval DOES NOT include zero, so we can conclude
# that in the population blood.glucose and short.velocity are
# positively correlated
# (but very unsure whether weak/moderate/strong)
# ----------
# Question 2
# ----------
# (code provided)
# (a) Barchart
# Bar chart of vore: only x is mapped, geom_bar() counts the rows in each
# category and draws one bar per category.
ggplot(msleep, aes(x = vore)) +
  geom_bar()
# Herbivore (herbi) is the most frequent vore
# msleep is the dataset
# aes() sets the x-axis to be vore
# and there is NO y-axis (this will be
# the counts)
# geom_bar() builds a frequency table and
# draws the bars
# you could do a horizontal barchart using
# coord_flip()
# (b) Boxplot
# Single boxplot of sleep_cycle: the quantitative variable goes on y and
# geom_boxplot() draws the box, whiskers and outlier dots.
ggplot(msleep, aes(y = sleep_cycle)) +
  geom_boxplot()
# Warning! Need some care with boxplot
# -- if there is only one then use y=... (there is only
# one thing on the x-axis)
# -- if side-by-side boxplots then use
# x=categorical, y=quantitative
# -- to switch to horizontal use coord_flip()
# The boxplot shows TWO outliers (dots)
# Calculate the upper fence: UQ + 1.5*IQR, where IQR = UQ - LQ
# -- DO NOT guesstimate from the plot
# -- UQ = 0.5792 and LQ = 0.1833; note that 0.3333 is the MEDIAN of
#    sleep_cycle (not the lower quartile), so it must not be used in the IQR
upper_fence <- 0.5792 + 1.5 * (0.5792 - 0.1833)
upper_fence
# [1] 1.17305
# or inspect the sorted (non-missing) sleep_cycle values, which are
# [1] 0.1166667 0.1166667 0.1333333 0.1500000 0.1500000 0.1666667 0.1833333
# [8] 0.1833333 0.1833333 0.2000000 0.2000000 0.2166667 0.2166667 0.2333333
# [15] 0.2833333 0.3333333 0.3333333 0.3500000 0.3833333 0.3833333 0.4166667
# [22] 0.4166667 0.5000000 0.5500000 0.6666667 0.6666667 0.7500000 0.7666667
# [29] 0.9000000 1.0000000 1.4166667 1.5000000
# the two values above the fence (1.4166667 and 1.5) are the TWO outliers,
# so the third from highest value is 1.0, the largest value below the
# fence (which is also the top of the whisker in the boxplot)
# Accept as answer either 1.0 or 1.17305
# (c) Pipe, i.e., using %>%
# Names of the animals whose total sleep is between 16 and 18 hours
# (inclusive); the two conditions inside filter() are combined with AND.
msleep %>%
  filter(sleep_total >= 16, sleep_total <= 18) %>%
  select(name)
# (d) Pipe to build summary table
# Summary table with one row per conservation status: number of animals
# and their mean body weight.
# NOTE(review): the summarise() step was missing here; it is taken from the
# same pipeline used in part (e) -- confirm against the question.
msleep %>%
  group_by(conservation) %>%
  summarise(count = n(), mean_bodywt = mean(bodywt))
# (e) Pipe including join
# Lookup table mapping each conservation code to its full description;
# tribble() lets the table be typed in row by row.
conservation_table <- tribble(
  ~conservation, ~description,
  "cd", "Conservation Dependent",
  "domesticated", "Domesticated",
  "en", "Endangered",
  "lc", "Least Concern",
  "nt", "Near Threatened",
  "vu", "Vulnerable"
)
# both tables have a column/variable called conservation
# so we can use a join
# Find the conservation status with the largest mean body weight and look
# up its full description in conservation_table.
msleep %>%
  group_by(conservation) %>%
  summarise(count = n(), mean_bodywt = mean(bodywt)) %>%
  # keep only the group whose mean body weight is the largest
  filter(mean_bodywt == max(mean_bodywt)) %>%
  # join on the shared conservation column (stated explicitly so dplyr
  # does not have to guess the key)
  left_join(conservation_table, by = "conservation") %>%
  select(description)
# with output ...
# # A tibble: 1 x 1
#   description
# <chr>
# 1 Vulnerable
# Question 3
# Answers are the R commands in each case
# (a) X~N(16,2^2), find P(X>20)
# mean is 16 and standard deviation is 2
# syntax: pnorm(x,mean,standard_deviation)
# pnorm() gives P(X < x), so the upper tail is 1 minus that
1 - pnorm(20, 16, 2)
# [1] 0.02275013
# (b) Find x so that P(X<x)=0.2
# qnorm() is the inverse of pnorm(): qnorm(p,mean,standard_deviation)
qnorm(0.2, 16, 2)
# [1] 14.31676
# (c) 1kg=1000g
# total weight of box T follows a normal distribution
# with mean 66*16 (remember to add means)
# and variance 66*2^2 (remember to add variances)
# so standard deviation is sqrt(66*4)
pnorm(1000, 66 * 16, sqrt(66 * 4))
# [1] 0.0002838844
# comment: very tiny chance of box having < 1kg
# -- the end --
# Keep only the color column of diamonds.
# NOTE(review): filter() has no condition, so it keeps every row; the
# intended condition appears to be missing -- confirm.
diamonds %>%
  filter() %>%
  select(color)
# Plot of carat (only x is mapped).
# NOTE(review): no geom layer is visible here, so this draws empty axes; a
# layer such as geom_histogram() presumably belongs here -- confirm.
ggplot(diamonds, aes(x = carat))
# For each cut: the number of diamonds and the correlation between carat
# and price. The correlation column is given an explicit name so it does
# not end up labelled with the raw expression text.
diamonds %>%
  group_by(cut) %>%
  summarise(number_diamond = n(), cor_carat_price = cor(carat, price))
# Wetland data: loading and removal must be paired, one removal value per
# loading value (14 pairs), otherwise tibble() stops with an
# incompatible-sizes error.
# NOTE(review): the removal vector originally had only 13 values; the value
# 11 (6th position) is restored on the assumption that this is the
# published BOD loading/removal data set -- confirm against the question
# sheet.
loading <- c(3, 8, 10, 11, 13, 16, 27, 30, 35, 37, 38, 44, 103, 142)
removal <- c(4, 7, 8, 8, 10, 11, 16, 26, 21, 9, 31, 30, 75, 90)
wetland <- tibble(loading, removal)
# Scatterplot of removal against loading with the least-squares line
ggplot(wetland, aes(x = loading, y = removal)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)