SOCI832: Lesson 12.1: Visualisation in R (with ggplot2 + sjPlot

1. Useful Resources
2. R Code

1. Useful Resources

2. R Code

# Update Packages
update.packages(ask = FALSE, repos='https://cran.csiro.au/', dependencies = TRUE)

# preparation
if (!require("sjlabelled")) install.packages("sjlabelled", dependencies = TRUE)
if (!require("sjmisc")) install.packages("sjmisc", dependencies = TRUE)
if (!require("sjPlot")) install.packages("sjPlot", dependencies = TRUE)
if (!require("dplyr")) install.packages("dplyr", dependencies = TRUE)
if (!require("ggrepel")) install.packages("ggrepel", dependencies = TRUE)
if (!require("Hmisc")) install.packages("Hmisc", dependencies = TRUE)
if (!require("ggplot2")) install.packages("ggplot2", dependencies = TRUE)
if (!require("ggthemes")) install.packages("ggthemes", dependencies = TRUE)
library(sjlabelled)
library(sjmisc) 
library(sjPlot)
library(dplyr)
library(ggrepel)
library(Hmisc)
library(ggplot2)
library(ggthemes)

# data
aus2012 <- readRDS(url("https://mqsociology.github.io/learn-r/soci832/aussa2012.RDS"))
lga <- readRDS(url("https://mqsociology.github.io/learn-r/soci832/nsw-lga-crime.RDS"))

# Turn off scientific notation
options(digits=3, scipen=8) 

# Stop View from overloading memory with a large datasets
RStudioView <- View
View <- function(x) {
  if ("data.frame" %in% class(x)) { RStudioView(x[1:500,]) } else { RStudioView(x) }
}

my_theme <-   theme_classic(base_size = 12)  +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, color="red", face="bold.italic"),    # Center title position and size
    plot.subtitle = element_text(hjust = 0.5),            # Center subtitle
    plot.caption = element_text(hjust = 0, face = "italic"), # move caption to the left
    axis.title.x = element_text(color="blue", size=14, face="bold"),
    axis.title.y = element_text(color="#993333", size=14, face="bold")
  )

################################################################################
# 1. Bar graph
################################################################################
# 1.1 Basic bar graph
plot_frq(aus2012$fechld, type = "bar")  + my_theme

plot_frq(aus2012$fechld, type = "bar", title = "Attitude toward working mom")  + my_theme  # Add the title

plot_frq(aus2012$fechld, type = "bar", title = "Attitude toward working mom",
        geom.colors = "darkolivegreen")  + my_theme  # Change bar color

plot_frq(aus2012$fechld, type = "bar", title = "Attitude toward working mom",
        show.values = FALSE)   + my_theme  # do not show values

plot_frq(aus2012$fechld, type = "bar", title = "Attitude toward working mom",
        show.prc = FALSE)  + my_theme  # do not show percentages

plot_frq(aus2012$fechld, 
        type = "bar", 
        title = "Attitude toward working mom",
        show.n = FALSE,
        coord.flip = TRUE) # do not show frequencies

# 1.2 Stacked bar graph
plot_stackfrq(aus2012$fechld)  + my_theme

plot_stackfrq(aus2012$fechld, coord.flip = FALSE) + my_theme # the x and y axis are swaped

plot_stackfrq(aus2012[, c("fechld", "fepresch", "famsuffr", "homekid", "housewrk")])  + my_theme # plot many variables simultaneously

plot_stackfrq(aus2012[, c("fechld", "fepresch", "famsuffr", "homekid", "housewrk")],
             title = "Attitude toward working women (or mom)") + theme_classic()# Add the title

# 1.3 Stacked bar graph for likert scale
plot_likert(aus2012[, c("fechld", "fepresch", "famsuffr", "homekid", "housewrk")],
           cat.neutral = 3)  + my_theme  # 'cat.neutral' means a neutral category such as "neither agree nor disagree".

plot_likert(aus2012[, c("fechld", "fepresch", "famsuffr", "homekid", "housewrk")],
           cat.neutral = 3, values = "hide") + my_theme # hide the percentage values on the bars.

plot_likert(aus2012[, c("fechld", "fepresch", "famsuffr", "homekid", "housewrk")],
           cat.neutral = 3, title = "Attitude toward working women (or mom)") + my_theme  # Add the title

plot_likert(aus2012[, c("fechld", "fepresch", "famsuffr", "homekid", "housewrk")],
           cat.neutral = 3, coord.flip = FALSE) + my_theme # the x and y axis are swaped; but you will see the variable name is so long; I will fix them in the next code.

plot_likert(aus2012[, c("fechld", "fepresch", "famsuffr", "homekid", "housewrk")],
           cat.neutral = 3, coord.flip = FALSE, wrap.labels = 20) + my_theme # 'wrap.labels' determines how many characters are displayed in one line and when a line brek is inserted

## Also, I would like to tell you how to change variable names
wrkwm <- aus2012[, c("fechld", "fepresch", "famsuffr", "homekid", "housewrk")] # extract five variables
wrkwm$fechld <- set_label(wrkwm$fechld, label = "Q1a") # change the variable name
wrkwm$fepresch <- set_label(wrkwm$fepresch, label = "Q1b")
wrkwm$famsuffr <- set_label(wrkwm$famsuffr, label = "Q1c")
wrkwm$homekid <- set_label(wrkwm$homekid, label = "Q1d")
wrkwm$housewrk <- set_label(wrkwm$housewrk, label = "Q1e")

plot_likert(wrkwm, cat.neutral = 3)

## Note: if you want to change value labels, use 'set_labels()'.

# 1.4 Grouped bar plot
plot_xtab(aus2012$fechld, aus2012$sex)  + my_theme

plot_xtab(aus2012$fechld, aus2012$sex, show.total = FALSE) + my_theme # do not show the total.

plot_xtab(aus2012$sex, aus2012$fechld, bar.pos = "stack", show.total = FALSE)  + my_theme

plot_xtab(aus2012$sex, aus2012$fechld, bar.pos = "stack", show.total = FALSE,
         margin = "row", coord.flip = TRUE, show.n = FALSE) + my_theme # 'margin' specifies how percentages are calculated; 'row' indicates the row percentage; 'show.n = FALSE' will remove the total number of cases.

plot_xtab(aus2012$sex, aus2012$fechld, bar.pos = "stack", show.total = FALSE,
         margin = "row", coord.flip = TRUE, show.n = FALSE, show.summary = TRUE) + my_theme # show chi-square statistics

plot_xtab(aus2012$sex, aus2012$fechld, bar.pos = "stack", show.total = FALSE,
         margin = "row", coord.flip = TRUE, show.n = FALSE,
         legend.title = "", 
         title = "Q1a Workimg mom: warm relationship with children as a not working mom",
         wrap.title = 70) + my_theme # remove the legend title; add the plot title; allow more characters in one line in the plot title

plot_xtab(aus2012$sex, aus2012$fechld, bar.pos = "stack", show.total = FALSE,
         margin = "row", coord.flip = TRUE, show.n = FALSE,
         legend.title = "", 
         title = "Q1a Workimg mom: warm relationship with children as a not working mom",
         wrap.title = 70,
         geom.colors = c("burlywood4", "burlywood1", "darkgray", "darkolivegreen1", "darkolivegreen4")) + my_theme # change colors for each category

################################################################################
# 2. Box plot
################################################################################
# 2.1 Simple box plot
aus2012$rhhwork <- set_label(aus2012$rhhwork, label = "Hours spent on household work") # set variable name
plot_frq(aus2012$rhhwork, type = "box")  + my_theme

plot_frq(aus2012$rhhwork, type = "box", ylim = c(0, 50)) + my_theme # reduce the range of y-axis.

# 2.2 Grouped box plot
plot_grpfrq(aus2012$rhhwork, aus2012$sex, type = "box")  + my_theme

plot_grpfrq(aus2012$rhhwork, aus2012$sex, type = "box",  ylim = c(0, 50))  + my_theme

# 2.3 3-way grouped box plot
plot_grpfrq(aus2012$rhhwork, aus2012$sex, intr.var = aus2012$degree, type = "box") + my_theme # value labels are long; I will change them

aus2012$degree <- set_labels(aus2012$degree, 
                             labels = c("Did not complete Yr 10" = 1,
                                    "Completed Yr 10" = 2,
                                    "Completed Yr 12" = 3,
                                    "Trade qualification or Apprenticeship" = 4,
                                    "TAFE or business college" = 5,
                                    "Bachelor degree" = 6,
                                    "Postgraduate degree" = 7)) # reassign value labels
plot_grpfrq(aus2012$rhhwork, aus2012$sex, intr.var = aus2012$degree, type = "box",
           wrap.labels = 8, ylim = c(0, 50))  + my_theme

################################################################################
# 3. Histogram
################################################################################
plot_frq(aus2012$age, type = "hist", title = "Distribution of Respondents' Age")  + my_theme

plot_frq(aus2012$age, type = "hist", title = "Distribution of Respondents' Age",
        show.mean = TRUE) + my_theme # show mean and standard deviation

plot_frq(aus2012$age, type = "hist", title = "Distribution of Respondents' Age",
        show.mean = TRUE, normal.curve = TRUE) + my_theme # Add the normal disribution curve.

plot_frq(aus2012$age, type = "hist", title = "Distribution of Respondents' Age",
        show.mean = TRUE, normal.curve = TRUE, 
        normal.curve.color = "red", normal.curve.size = 3) + my_theme # change the colour and the line width of the normal curve

################################################################################
# 4. Scatter plot
################################################################################
aus2012$educyrs <- set_label(aus2012$educyrs, label = "Years of schooling")

# 4.1 Simple scatter plot
plot_scatter(aus2012, educyrs, rhhwork) + my_theme

plot_scatter(aus2012, educyrs, rhhwork, jitter = TRUE)+ my_theme # points are jittered to avoid overlapping.

plot_scatter(aus2012, educyrs, rhhwork, jitter = TRUE, fit.line = lm)+ my_theme # add the fitted line

# 4.2 Grouped scatter plot
plot_scatter(aus2012, educyrs, rhhwork, sex, jitter = TRUE)+ my_theme # different colors by gender

plot_scatter(aus2012, educyrs, rhhwork, sex, jitter = TRUE,
            fit.grps = lm)+ my_theme # add the fitted line for each group

plot_scatter(aus2012, educyrs, rhhwork, sex, jitter = TRUE,
            fit.grps = lm, show.ci = TRUE)+ my_theme # show confidence intervals

plot_scatter(aus2012, educyrs, rhhwork, sex, jitter = TRUE,
            fit.grps = lm, show.ci = TRUE, grid = TRUE)+ my_theme # grouped scatter plot as facets

plot_scatter(aus2012, educyrs, rhhwork, sex, jitter = TRUE,
            fit.grps = lm, show.ci = TRUE, grid = TRUE,
            title = "Educational effects on household work by gender")+ my_theme # add the title

# 4.3 Scatter plot with text label
plot_scatter(lga, giniinc, sexoff, dot.labels = lga$name3)+ my_theme #This is too busy

plot_scatter(lga, giniinc, sexoff, dot.labels = lga$name) + my_theme

################################################################################
# 5. Correlation matrix
################################################################################
var.index <- c("astdomviol", "astnondomviol", "sexoff", "robbery", "popden", "medinc", "giniinc", "unemploy") # make a list of variables for correlation matrix

sjp.corr(lga[,var.index]) + my_theme

sjp.corr(lga[,var.index], show.legend = TRUE)+ my_theme # show the legend for colours

################################################################################
# 6. Regression
################################################################################
# data preparation
var.list <- c("fepresch", "famsuffr","rhhwork", "educyrs", "sex", "work", "topbot", "age", "marital", "id", "risei") # select variables.
mydata <- aus2012[, var.list] # make a dataset with the chosen variables
#------------
# In the next section the dplyr operator %>% is used. 
# For an introduction to the %>% operator see this:
# https://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html 
#------------
mydata <- mydata %>%
            select(marital) %>%
            rec(rec = "1:2 = 1; 3:6 = 0") %>% # old value = new value; 3:6 means 3 to 6.
            add_columns(mydata) # recode 'marital' in a way that those currently married or in a partnership are coded as 1, otherwise as 0.
mydata <- mydata %>%
            select(marital_r) %>%
            to_factor() %>%
            add_columns(mydata) # convert the variable into a factor
mydata <- mydata %>%
            rename(currmarried = marital_r) # change the variable name into 'currmarried'.

#------------
mydata <- mydata %>%
            select(sex) %>%
            to_dummy() %>%
            add_columns(mydata) # make dummy variables for sex
mydata <- mydata %>%
            rename(male = sex_1, female = sex_2) # change the variable names
mydata <- mydata %>%
            select(male, female) %>%
            to_factor() %>%
            add_columns(mydata) # convert the variables into factors.

#------------
mydata <- mydata %>%
          select(work) %>%
          to_dummy() %>%
          add_columns(mydata) # make dummy variables for work
mydata <- mydata %>%
  rename(currwrk = work_1, pastwrk = work_2, nowrk = work_3) # change the variable names
mydata <- mydata %>%
  select(currwrk, pastwrk, nowrk) %>%
  to_factor() %>%
  add_columns(mydata) # convert the variables into factors.
#------------
mydata <- mydata %>%
          select(fepresch, famsuffr) %>%
          dicho(dich.by = 3) %>%
          add_columns(mydata) # 'disagree" and "strongly disagree" are collapsed into 1, otherwise 0
mydata <- mydata %>%
  select(fepresch_d, famsuffr_d) %>%
  to_factor() %>%
  add_columns(mydata) # convert the variables into factors.

#------------
mydata$female <- set_label(mydata$female, label = "Female")
mydata$female <- set_labels(mydata$female, labels = c("Male"= 0, "Female" = 1))
mydata$currmarried <- set_label(mydata$currmarried, label = "Currently married or in a partnership")
mydata$age <- set_label(mydata$age, label = "Age")
mydata$educyrs <- set_label(mydata$educyrs, label = "Years of schooling")
mydata$topbot <- set_label(mydata$topbot, label = "Class")
mydata$currwrk <- set_label(mydata$currwrk, label = "Currently working")
mydata$rhhwork <- set_label(mydata$rhhwork, label = "Hours spent on household working")
mydata$fepresch_d <- set_label(mydata$fepresch_d, label ="Attitude toward working mom: disagree with the statement that preschool child is likely to suffer.")
#------------

################################################################################
# 6.1 lm model - OLS regression model
mod1 <- lm(rhhwork ~ educyrs + age + topbot + female + currmarried + currwrk, data = mydata)
summary(mod1)

## 
## Call:
## lm(formula = rhhwork ~ educyrs + age + topbot + female + currmarried + 
##     currwrk, data = mydata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -20.59  -6.36  -1.75   4.22  73.63 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   12.0120     2.0146    5.96  3.2e-09 ***
## educyrs       -0.0493     0.0775   -0.64     0.52    
## age            0.0212     0.0205    1.03     0.30    
## topbot        -0.1146     0.1702   -0.67     0.50    
## female1        4.9572     0.5581    8.88  < 2e-16 ***
## currmarried1   3.4371     0.5864    5.86  5.9e-09 ***
## currwrk1      -4.9496     0.6559   -7.55  8.6e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.69 on 1253 degrees of freedom
##   (352 observations deleted due to missingness)
## Multiple R-squared:  0.141,  Adjusted R-squared:  0.137 
## F-statistic: 34.3 on 6 and 1253 DF,  p-value: <2e-16

mod2 <- lm(rhhwork ~ age + educyrs * female, data = mydata)
summary(mod2)

## 
## Call:
## lm(formula = rhhwork ~ age + educyrs * female, data = mydata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -19.88  -6.62  -2.25   4.04  76.89 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.4779     1.9227    2.85   0.0044 ** 
## age               0.1203     0.0175    6.90    8e-12 ***
## educyrs          -0.0929     0.1066   -0.87   0.3837    
## female1           6.3706     2.0144    3.16   0.0016 ** 
## educyrs:female1  -0.0788     0.1411   -0.56   0.5767    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10 on 1422 degrees of freedom
##   (185 observations deleted due to missingness)
## Multiple R-squared:  0.0917, Adjusted R-squared:  0.0891 
## F-statistic: 35.9 on 4 and 1422 DF,  p-value: <2e-16

# Forest plot (coefficient and its confidence intervals)
plot_model(mod1, type = "est") + my_theme

plot_model(mod1, type = "est", remove.estimates =  c("educyrs", "age", "topbot"))+ my_theme # remove estimates for 'educyrs", "age' and "topbot'

plot_model(mod1, type = "std") + my_theme # forest-plot with standardized coefficients

# Plot predicted values
plot_model(mod1, type = "pred", terms = "educyrs") + my_theme

plot_model(mod1, type = "pred", terms = "female") + my_theme

plot_model(mod1, type = "pred", terms = "female", geom.size = 1.1)+ my_theme # increase the width of the fitted line

# Plot interaction effect
plot_model(mod2, type = "int") + my_theme

################################################################################
# 6.2 glm model - logistic regression
################################################################################
# logit model
mod3 <- glm(fepresch_d ~ age + educyrs + female, data = mydata, family = "binomial")
summary(mod3)

## 
## Call:
## glm(formula = fepresch_d ~ age + educyrs + female, family = "binomial", 
##     data = mydata)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.665  -1.141   0.819   1.119   1.670  
## 
## Coefficients:
##             Estimate Std. Error z value   Pr(>|z|)    
## (Intercept)  0.03572    0.32999    0.11     0.9138    
## age         -0.01753    0.00357   -4.91 0.00000089 ***
## educyrs      0.04750    0.01502    3.16     0.0016 ** 
## female1      0.45978    0.10924    4.21 0.00002565 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2001.6  on 1443  degrees of freedom
## Residual deviance: 1926.7  on 1440  degrees of freedom
##   (168 observations deleted due to missingness)
## AIC: 1935
## 
## Number of Fisher Scoring iterations: 4

mod4 <- glm(fepresch_d ~ age + educyrs * female, data = mydata, family = "binomial")
summary(mod4)

## 
## Call:
## glm(formula = fepresch_d ~ age + educyrs * female, family = "binomial", 
##     data = mydata)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -1.66   -1.14    0.82    1.12    1.67  
## 
## Coefficients:
##                 Estimate Std. Error z value   Pr(>|z|)    
## (Intercept)      0.02600    0.38648    0.07      0.946    
## age             -0.01753    0.00357   -4.91 0.00000091 ***
## educyrs          0.04825    0.02155    2.24      0.025 *  
## female1          0.47891    0.41105    1.17      0.244    
## educyrs:female1 -0.00140    0.02889   -0.05      0.961    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2001.6  on 1443  degrees of freedom
## Residual deviance: 1926.7  on 1439  degrees of freedom
##   (168 observations deleted due to missingness)
## AIC: 1937
## 
## Number of Fisher Scoring iterations: 4

# Odds ratio plot
plot_model(mod3, type = "est", wrap.title = 100) + my_theme

# Plot predicted values
plot_model(mod3, type = "pred", terms = "educyrs") + my_theme

plot_model(mod3, type = "pred", terms = "age") + my_theme

plot_model(mod3, type = "pred", terms = "female") + my_theme

# Plot interaction effect
plot_model(mod4, type = "int") + my_theme

Last updated on 21 October, 2019 by Dr Nicholas Harrigan (nicholas.harrigan@mq.edu.au)