4 Data visualization

4.1 Data visualization introduction

4.1.1 Summarization

library(tidyverse)
library(dplyr)
mtcars %>%
  mutate( kml = mpg * 0.42) %>%
  group_by(cyl) %>%
   summarise(avg_US = mean(mpg), avg_metric = mean(kml))

## # A tibble: 3 × 3
##     cyl avg_US avg_metric
##   <dbl>  <dbl>      <dbl>
## 1     4   26.7      11.2 
## 2     6   19.7       8.29
## 3     8   15.1       6.34

mpg %>% 
  group_by(manufacturer, year) %>% 
  summarise_at(vars(cty, hwy), mean)

## # A tibble: 30 × 4
## # Groups:   manufacturer [15]
##    manufacturer  year   cty   hwy
##    <chr>        <int> <dbl> <dbl>
##  1 audi          1999  17.1  26.1
##  2 audi          2008  18.1  26.8
##  3 chevrolet     1999  15.1  21.6
##  4 chevrolet     2008  14.9  22.1
##  5 dodge         1999  13.4  18.4
##  6 dodge         2008  13.0  17.6
##  7 ford          1999  13.9  18.6
##  8 ford          2008  14.1  20.5
##  9 honda         1999  24.8  31.6
## 10 honda         2008  24    33.8
## # ℹ 20 more rows

change layout

mpg %>% count(class, year)%>% 
  spread(class, n)

## # A tibble: 2 × 8
##    year `2seater` compact midsize minivan pickup subcompact   suv
##   <int>     <int>   <int>   <int>   <int>  <int>      <int> <int>
## 1  1999         2      25      20       6     16         19    29
## 2  2008         3      22      21       5     17         16    33

change all characters into factors

mpg <- mpg %>% 
  mutate_if(is.character, as.factor) #if a column is a character, change to a factor

wide to long data

mpg1 <- mpg %>% 
  gather("key", "value", cty, hwy)

convert wide data to long data using pivot_longer

## Your code here. Naming choices for 1 and 2 are yours
dta <- mpg %>% 
  pivot_longer(cty:hwy, names_to = "var", values_to = "value") %>%  # Both of those are 
  # value label
   mutate(var = ifelse( var == 'cty', 'city','highway'))

ggplot(dta, aes(x = displ, y = value)) + 
  geom_point(aes(color = var))  +
  geom_smooth(aes(color = var), se = F)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

explore distribution

library(DataExplorer)
library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'package:Hmisc':
## 
##     describe

## The following object is masked from 'package:plotrix':
## 
##     rescale

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(naniar)

## 
## Attaching package: 'naniar'

## The following object is masked from 'package:skimr':
## 
##     n_complete

plot_histogram(riskfactors)

explore relationship/correlation

library(psych)
pairs.panels(riskfactors[,1:10])

create a individual theme

my_theme <- function(){
  theme_bw() + 
 
  theme(axis.title = element_text(size=16),
  axis.text = element_text(size=14),
  text = element_text(size = 14))
}

4.1.2 Explore missing values

# install.packages("naniar")
library(naniar)
# head(riskfactors)
riskfactors <- riskfactors

gg_miss_upset(riskfactors,nsets=10)

# install.packages("DataExplorer")

plot_missing(riskfactors)

# take a quick look at the data types of each column
visdat::vis_dat(riskfactors)

4.1.3 Add statistical test

library(ggpubr)
plt <- ggplot(  data=mpg,
  mapping= aes(x =  as.factor(year),
               y = cty,
               color = as.factor(year) ) )+
  geom_boxplot() +
  geom_jitter(width=0.1)+
  labs(x = 'Year',
       y = "City mpg") +
  my_theme()+ facet_wrap( ~ manufacturer,nrow = 2)  
  
# add statistical test
  my_comparisons <- list(c('1999','2008'))
plt + stat_compare_means() +
stat_compare_means(comparisons = my_comparisons)

4.1.4 Add texts to dots

USArrests <- USArrests %>% rownames_to_column('State')

ggplot(USArrests, aes(
x=UrbanPop,y=Murder))+
geom_point() +
  labs(x = "Percent of population that is urban", 
       y = "Murder arrests (per 100,000)",
       caption =  "McNeil (1997). Interactive Data Analysis")+
  geom_text(aes(label=State),size=3)

4.1.5 Set the legend

ggplot(iris, aes(x=  Sepal.Length , fill= as.factor( Species)) ) +  #whole plot's option
  
  geom_histogram(aes(y=..density..),alpha=0.5, position="identity" , bins = 50)+
  geom_density(aes(linetype=as.factor(Species)),alpha=.1 )+    #aesthetic's option
  
  scale_fill_manual( name = "Groups",values = c("grey", "black", "skyblue"),labels = c("setosa", "versicolor" , "virginica" ))+  
  scale_linetype_manual( name = "Groups" ,values = c(1,3,5),labels = c("setosa", "versicolor" , "virginica") )+       # common legend 
  
  labs(x = "Sepal.Length", 
       y = "Density", 
       title = "")

4.1.6 Create a panel of plots

combine multiple plots into one

p1=ggplot(data=riskfactors,aes(x=age))+
  geom_histogram(bins = 30 )

p2=ggplot(data=riskfactors,aes(x=sex))+
  geom_bar (aes(x=sex) )

p3=ggplot(riskfactors,aes(x = education, y = bmi))+
  geom_boxplot (  )

p4=ggplot(riskfactors, aes(x = marital )) + 
  geom_bar(aes(group = education, y = (..count..)/sum(..count..),fill = education)) + 
          scale_y_continuous(labels=scales::percent) 

# install.packages("ggpubr")
library(ggpubr)
ggarrange(p1, p2, p3, p4, ncol = 2, nrow=2)

4.1.7 Plots in regression

create linear regression model

data("Boston", package = "MASS")
linear_reg <- glm(medv ~ ., data=Boston , family = gaussian())
summary(linear_reg)

## 
## Call:
## glm(formula = medv ~ ., family = gaussian(), data = Boston)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -15.595   -2.730   -0.518    1.777   26.199  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## black        9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 22.51785)
## 
##     Null deviance: 42716  on 505  degrees of freedom
## Residual deviance: 11079  on 492  degrees of freedom
## AIC: 3027.6
## 
## Number of Fisher Scoring iterations: 2

summary

knitr::kable(broom::tidy(linear_reg))

term	estimate	std.error	statistic	p.value
(Intercept)	36.4594884	5.1034588	7.1440742	0.0000000
crim	-0.1080114	0.0328650	-3.2865169	0.0010868
zn	0.0464205	0.0137275	3.3815763	0.0007781
indus	0.0205586	0.0614957	0.3343100	0.7382881
chas	2.6867338	0.8615798	3.1183809	0.0019250
nox	-17.7666112	3.8197437	-4.6512574	0.0000042
rm	3.8098652	0.4179253	9.1161402	0.0000000
age	0.0006922	0.0132098	0.0524024	0.9582293
dis	-1.4755668	0.1994547	-7.3980036	0.0000000
rad	0.3060495	0.0663464	4.6128998	0.0000051
tax	-0.0123346	0.0037605	-3.2800091	0.0011116
ptratio	-0.9527472	0.1308268	-7.2825106	0.0000000
black	0.0093117	0.0026860	3.4667926	0.0005729
lstat	-0.5247584	0.0507153	-10.3471458	0.0000000

create logistical regression

# load the Pima Indians dataset from the mlbench dataset
library(mlbench)
data(PimaIndiansDiabetes) 
# rename dataset to have shorter name because lazy
diabetes <- PimaIndiansDiabetes

logistic_reg <- glm(diabetes ~ ., data=diabetes, family = binomial)

summary(logistic_reg)

## 
## Call:
## glm(formula = diabetes ~ ., family = binomial, data = diabetes)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5566  -0.7274  -0.4159   0.7267   2.9297  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.4046964  0.7166359 -11.728  < 2e-16 ***
## pregnant     0.1231823  0.0320776   3.840 0.000123 ***
## glucose      0.0351637  0.0037087   9.481  < 2e-16 ***
## pressure    -0.0132955  0.0052336  -2.540 0.011072 *  
## triceps      0.0006190  0.0068994   0.090 0.928515    
## insulin     -0.0011917  0.0009012  -1.322 0.186065    
## mass         0.0897010  0.0150876   5.945 2.76e-09 ***
## pedigree     0.9451797  0.2991475   3.160 0.001580 ** 
## age          0.0148690  0.0093348   1.593 0.111192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 723.45  on 759  degrees of freedom
## AIC: 741.45
## 
## Number of Fisher Scoring iterations: 5

summary

knitr::kable(broom::tidy(logistic_reg))

term	estimate	std.error	statistic	p.value
(Intercept)	-8.4046964	0.7166359	-11.7279870	0.0000000
pregnant	0.1231823	0.0320776	3.8401403	0.0001230
glucose	0.0351637	0.0037087	9.4813935	0.0000000
pressure	-0.0132955	0.0052336	-2.5404160	0.0110721
triceps	0.0006190	0.0068994	0.0897131	0.9285152
insulin	-0.0011917	0.0009012	-1.3223094	0.1860652
mass	0.0897010	0.0150876	5.9453340	0.0000000
pedigree	0.9451797	0.2991475	3.1595780	0.0015800
age	0.0148690	0.0093348	1.5928584	0.1111920

4.1.7.1 Create forest plots for `coefficients` or `OR`

library(sjPlot)

## #refugeeswelcome

plot_model(linear_reg, show.values = TRUE, value.offset = 0.5)

plot_model(logistic_reg, show.values = TRUE, value.offset = .5, vline.color = "black")

another way

library(finalfit) 

explanatory = c(  "crim"  ,  "zn"   ,   "indus"  ,   "nox"   ,  "rm"   ,   "age" ,    "dis"  ,   "rad"   ,  "tax"   ,"ptratio" ,"black"  , "lstat" )

dependent = "medv"


Boston %>%
 coefficient_plot(dependent, explanatory, table_text_size=3, 
                  title_text_size=12,
                   plot_opts=list(xlab("Beta, 95% CI"), 
                                  theme(axis.title = element_text(size=12))))

library(finalfit) 

explanatory = c(  "pregnant", "glucose" , "pressure", "triceps"  ,"insulin" , "mass"   ,  "pedigree", "age"  )

dependent = "diabetes"

diabetes %>%
 or_plot(dependent, explanatory, table_text_size=3, 
                  title_text_size=12,
                   plot_opts=list(xlab("OR, 95% CI"), 
                                  theme(axis.title = element_text(size=12))))

## Waiting for profiling to be done...
## Waiting for profiling to be done...
## Waiting for profiling to be done...

qq plot

ggqqplot( (Boston$medv))

Loading data set

library(printr)

## Registered S3 method overwritten by 'printr':
##   method                from     
##   knit_print.data.frame rmarkdown

library(tidyverse)
head(iris)

Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species
5.1	3.5	1.4	0.2	setosa
4.9	3.0	1.4	0.2	setosa
4.7	3.2	1.3	0.2	setosa
4.6	3.1	1.5	0.2	setosa
5.0	3.6	1.4	0.2	setosa
5.4	3.9	1.7	0.4	setosa

4.2 Scatter plot

4.2.1 Create a empty canvas

then create aesthetic mapping
tell the function which dataset and variables to use

ggplot(data = iris,        # which data set? canvas? 
       aes(x=Sepal.Length , y=Petal.Length   ))  # which variables as aesthetics? x and y are mapped to columns of the data; different geoms can have different aesthetics (different variables).

4.2.2 Add a layer/geom of `points` to the canvas

ggplot(data = iris,
       mapping = aes(x=Sepal.Length , y=Petal.Length  )) + 
  geom_point()   # adding the geometrical representation

# same plot as above
ggplot(data = iris) + 
  geom_point(  aes(x=Sepal.Length , y=Petal.Length  ))

4.2.3 Add another aesthetic

add a curve/straight line to fit these points
geom provides the aesthetic to ggplot

# Loess curve
ggplot(data = iris, 
       mapping = aes(x=Sepal.Length , y=Petal.Length  )) + 
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Linear regression line
ggplot(data = iris, 
       mapping = aes(x=Sepal.Length , y=Petal.Length)) + 
  geom_point() +
  geom_smooth(method = "lm")

## `geom_smooth()` using formula = 'y ~ x'

4.2.4 Add other aesthetic

set other aesthetics colour, alpha (transparency), and size of points

ggplot(data = iris) + 
  geom_point(aes(x=Sepal.Length , y=Petal.Length, size = Sepal.Width     ), 
             alpha = .5, 
             colour = "red")

ggplot(data = iris) + 
  geom_point(aes(x=Sepal.Length , y=Petal.Length, size = Sepal.Width , colour=Species), #white is a variable here 
             alpha=.9)

categorize Petal.Width then map colour to this new variable

iris <- iris %>% 
  mutate(growth = ifelse(Petal.Width   > 1.5, "Wide", "Normal"))

ggplot(data=iris) + 
  geom_point(aes(x=Sepal.Length , y=Petal.Length, size = Sepal.Width , colour=growth), 
             alpha=.9)

4.3 Bar chart

ggplot(data = iris) + 
  geom_bar(aes(x = growth))

bar chart after group_by
then use stat='identity'

library(dplyr)
results  <- iris %>% 
  group_by(Species, growth) %>% 
   summarise(Sepal.Length.mean=mean (Sepal.Length ))

## `summarise()` has grouped output by 'Species'. You can override using the
## `.groups` argument.

 gop <- results  %>% 
  filter(Species != "setosa_null"  )
 gop

## # A tibble: 5 × 3
## # Groups:   Species [3]
##   Species    growth Sepal.Length.mean
##   <fct>      <chr>              <dbl>
## 1 setosa     Normal              5.01
## 2 versicolor Normal              5.91
## 3 versicolor Wide                6.18
## 4 virginica  Normal              6.13
## 5 virginica  Wide                6.62

though meaningless below until line chart (just use the mean as the sum for demonstration)

# We can also store parts of a plot in an object
plot1 <- ggplot(gop) + 
  geom_bar(aes(x=growth , y=Sepal.Length.mean), 
           
           stat='identity')
plot1

### Add some options for the whole ggplot rather than layers - switch the x and y axes

plot1 + 
  coord_flip()

reorder x categories (-means descending)

ggplot( gop) + 
  geom_bar(aes(x=reorder(growth, -Sepal.Length.mean), y=Sepal.Length.mean, fill=growth), 
           stat='identity') + 
  coord_flip()

add x axis label and a theme

ggplot(gop) + 
  geom_bar(aes(x=reorder(growth, -Sepal.Length.mean), y=Sepal.Length.mean, fill=growth), 
           stat='identity') + 
  coord_flip() + 
  xlab("Growth categories") + 
  guides(fill=F) +
  theme_minimal()

set theme

library(ggthemes)
ggplot(data = iris) + 
  geom_bar(aes(x = growth)) + 
  theme_economist()

4.3.1 Grouped bar chart

-bar chart with different panels

ggplot(mpg, aes(x = class)) + 
  geom_bar() + 
  facet_wrap( ~ year)

actual number (groups are stacked by default)

ggplot(gop) + 
  geom_bar(aes(x=growth, y=Sepal.Length.mean, fill=Species)
           , stat='identity'
           )

ggplot(mpg, aes(x = class )) + 
  geom_bar(aes(group = year, fill = year), position = "stack")

percentage

ggplot(gop) + 
  geom_bar(aes(x=growth, y=Sepal.Length.mean, fill=Species), 
           stat='identity', 
           position='fill')

groups are dodge with actual number

ggplot(gop) + 
  geom_bar(aes(x=growth, y=Sepal.Length.mean, fill=Species), 
           stat='identity', 
           position='dodge')

- groups are dodge with percentage

gop2 <- gop %>% 
  group_by(growth ) %>% 
  mutate(Sepal.Length.prop=Sepal.Length.mean/sum(Sepal.Length.mean))

ggplot(gop2) + 
  geom_bar(aes(x=growth, y=Sepal.Length.prop, fill=Species), 
           stat='identity', 
           position='dodge') + 
  ylab("Votes (%)")

4.4 Line charts

ggplot(iris) + 
  geom_line(aes(x=Sepal.Length , y=Petal.Length))

4.4.1 Grouped by `colour variable`

ggplot(iris) + 
  geom_line(aes(x=Sepal.Length , y=Petal.Length, colour = Species))

grouped by state then set how many rows or columns

ggplot(iris) + 
  geom_line(aes(x=Sepal.Length , y=Petal.Length) ) + 
  
  facet_wrap(~Species, nrow = 1) +      #set how many rows
  coord_flip()

4.4.2 Multiple aesthetics

iris <- iris %>% 
  mutate(growth = ifelse(Petal.Width   > 1.5, "Wide", "Normal"))

ggplot(iris, aes(x = Sepal.Length, y = Petal.Length)) + 
  geom_line(size=2,color="purple")+  
  # number format
  scale_x_log10(labels = scales::label_number())+
  geom_point( aes(size =  Sepal.Length,colour = as.factor(growth)),show.legend = F)+
  facet_wrap(~ Species)

4.5 ggplot2 parameters

For detail, please read this article and this one.

library(datasauRus)
  ggplot( )+
    geom_point(data=datasaurus_dozen[datasaurus_dozen$dataset=="dino",], aes(x = x, y = y),color= "#7CAE00" ) +
    theme_void()+
    theme(legend.position = "none")

4.5.1 Components of plot

Components of plot
- Data: is a data frame
- Aesthetics: is used to indicate x and y variables and to control the color, the size or the shape …
- Geometry: the type of graphics (bar plot, line plot, scatter plot…)
adjust parameters
- adjust legend using guide_
- adjust color, size, and shape using scale_, guide_ can further adjust scale_
- adjust panel, background, axis (font, color, size, angle), title, legend (position), caption using theme
types of plots
- geom_boxplot(): Box plot
- geom_violin(): Violin plot
- geom_dotplot(): Dot plot
- geom_jitter(): Jitter charts
- geom_line(): Line plot
- geom_bar(): Bar plot
- geom_errorbar(): Error bars
- geom_point(): Scatter plot
- geom_smooth(): Add smoothed curve
- geom_quantile(): Add quantile lines
- geom_text(): Textual annotations
- geom_density(): Create a smooth density curve
- geom_histogram(): Histogram

example

data("faithful")
# Basic scatterplot
ggplot(data = faithful, 
       mapping = aes(x = eruptions, y = waiting)) + 
  geom_point()+
  stat_density_2d(aes(fill = ..level..), geom="polygon")

# Data and mapping can be given both as global (in ggplot()) or per layer
# ggplot() + 
#   geom_point(mapping = aes(x = eruptions, y = waiting),
#              data = faithful)

4.5.2 Create main title, axis labels, caption

pay attention whether argument is factor or continuous.

ggplot(data = faithful, 
       mapping = aes(x = eruptions, y = waiting)) + 
  geom_point()+
  
  labs(title = "Number of xxx",
       caption = "source: http://xxx",
       x = "Eruptions" ,
       y = "Waiting time" ) +
# customize title, axis, caption  
  theme(
plot.title = element_text(color="red", size=14, face="bold.italic"),
plot.caption = element_text(color="red", size=10, face="italic"),
axis.title.x = element_text(color="blue", size=14, face="bold"),
axis.title.y = element_text(color="#993333", size=14, face="bold")
)+
  
# hide main title  
theme(plot.title = element_blank() )

4.5.3 Create legend title, position

p <- ggplot(faithful) + 
  geom_point(aes(x = eruptions, y = waiting, colour = eruptions < 3))+
  labs(color="Numbers of eruptions") +
  
  theme(legend.position = "left")+
# Change the appearance of legend title and labels
  theme(legend.title = element_text(colour="blue"),
          legend.text = element_text(colour="red"))+
# Change legend box background color
  theme(legend.background = element_rect(fill=NULL))
print(p)

customize legends using scale functions

# how to change order of legend?  
# Set legend title and labels
p+ scale_color_discrete(name = "Numbers of eruptions change", labels = c("F", "T" ))

customize legend

guide_colorbar(): continuous colors

guide_legend(): discrete values (shapes, colors)

ggplot(faithful) + 
  geom_point(aes(x = eruptions, y = waiting, size = eruptions < 3))+
  guides(size = guide_legend(
    # legend title
    title = "title is too low",
    title.position = "bottom",
    title.vjust = -5,
    # legend label
    label.position = "left",
    label.hjust = 1,
    label.theme = element_text(size = 15,
                               face = "italic",
                               colour = "red",
                               angle = 0),
    # label reverse
    reverse = TRUE,
    
    # width of bin
    keywidth = 1,
    
    ncol = 4
    
   )
  )

delete a legend

  ggplot(mpg, aes(x = displ, y = hwy, color = class, size = cyl)) +
  geom_point() +
  guides(
    color = guide_legend("type"),      # keep
    size = "none"                      # remove
  )

combine two legends when they use the same variable (mapping)

  ggplot(mpg, aes(x = displ, y = hwy, color = cyl, size = cyl)) +
  geom_point() +
  scale_color_viridis_c() +
  
  guides(
    color = guide_legend("title"),
    size = guide_legend("title")
  )

# guide = "legend"

4.5.4 Change plot colors

set color into aes()

ggplot(faithful) + 
  geom_point(aes(x = eruptions, y = waiting, colour = eruptions < 3))

set color outside of aes()

ggplot(faithful) + 
  geom_point(aes(x = eruptions, y = waiting),
             colour = 'steelblue')

Colour the histogram with color and fill

ggplot(faithful) + 
  geom_histogram(aes(x = eruptions,color=eruptions < 3, fill=eruptions < 4))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Colour the histogram by waiting and changing position

ggplot(faithful) + 
  geom_histogram(aes(x = eruptions,color=waiting>60), position = 'dodge')

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(faithful) + 
  geom_histogram(aes(x = eruptions,color=waiting>60), position = 'identity')

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

For fill and stack position, please see position section.

change colors manually using scale

ggplot(faithful) + 
  geom_histogram(aes(x = eruptions,color=eruptions < 3, fill=eruptions < 4))+
  # Box plot
  scale_fill_manual(values=c("#999999", "#E69F00"))

# Scatter plot
  scale_color_manual(values=c( "#E69F00", "#56B4E9"))

## <ggproto object: Class ScaleDiscrete, Scale, gg>
##     aesthetics: colour
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: waiver
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: grey50
##     name: waiver
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: environment
##     rescale: function
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale, gg>

using scale brewer automatically

ggplot(faithful) + 
  geom_histogram(aes(x = eruptions,color=eruptions < 3, fill=eruptions < 4))+
  # Box plot
  scale_fill_brewer(palette="Dark2")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Scatter plot
  scale_color_brewer(palette="Set1")

## <ggproto object: Class ScaleDiscrete, Scale, gg>
##     aesthetics: colour
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: waiver
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: NA
##     name: waiver
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: environment
##     rescale: function
##     reset: function
##     scale_name: brewer
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale, gg>

  # using guide to change the color of legend key

using gray colors using scale

# 
p + scale_fill_grey()  #no fill element

# 
p + scale_color_grey()

Gradient or continuous colors (can set the middle point aswhite)

# Color by cty values
sp2<-ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = cty))
sp2

# Change the low and high colors
# Sequential color scheme
sp2+scale_color_gradient(low="blue", high="red")

# Diverging color scheme
mid<-mean(mpg$cty)
sp2+scale_color_gradient2(midpoint=mid, low="blue", mid="white",
                          high="red"  )

4.5.5 Change points shapes, transparent and size

make the points larger and slightly transparent.

ggplot(faithful) + 
  geom_point(aes(x = eruptions, y = waiting, shape= eruptions < 3, size=eruptions), color="steelblue", alpha=0.5)

# hwo to reverse order of legend size 
ggplot(faithful) + 
  geom_point(aes(x = eruptions, y = waiting, shape= eruptions < 3, size=eruptions), color="steelblue", alpha=0.5)+

  scale_shape_manual(values=c(10, 23 ))+
 
  theme(legend.position="top")

4.5.6 Change bars position

p <- ggplot(mpg, aes(fl, fill = drv))

p1 <- p + geom_bar ()

p2 <- p + geom_bar(position = "dodge")
 
p3 <-p + geom_bar(position = "fill")
 
p4 <-p + geom_bar(position = "stack")

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

grid.arrange(p1, p2, p3,p4, ncol=2)

4.5.7 Add text annotations

ggplot(data=mpg[(1:100), ], aes(x = displ, y = hwy)) +
  geom_point(aes(color = cty))+
  
  geom_text(aes(label = manufacturer ),
               size = 2, vjust = -1)  #vjust is site not direction

4.5.8 Add a line that (separates points)

ggplot(faithful) + 
  geom_point(aes(x = eruptions, y = waiting))+
  
  geom_abline(slope=-13,intercept = 100,color="red", linetype = "dashed")+
  # Add horizontal line at y = 2O; change line type and color
  geom_hline(yintercept=20, linetype="dotted", color = "red")+
# Add vertical line at x = 3; change line type, color and size
  geom_vline(xintercept = 3, color = "blue", size=1.5)

# Add regression line

add segment and arrow

ggplot(mpg, aes(x = displ, y = hwy )) +
  geom_point() +
# Add horizontal line segment
  geom_segment(aes(x = 2, y = 15, xend = 3, yend = 15, size=3, color="red")) +
  geom_segment(aes(x = 3, y = 33, xend = 2.5 , yend = 30),
                  arrow = arrow(length = unit(0.5, "cm")))

fitted curve

ggplot(data=mpg[mpg$fl!="c",], aes(x = displ, y = hwy)) +
  geom_point( ) +
  geom_smooth(color="red") #fitted curve

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(data=mpg[mpg$fl!="c",], aes(x = displ, y = hwy)) +
  geom_point( ) + 
  geom_quantile() +
  geom_rug()+
  theme_minimal()

## Smoothing formula not specified. Using: y ~ x

customize line

ggplot(mpg, aes(x = displ, y = hwy, color = fl, linetype = fl, size=fl)) +
  geom_point() +
  geom_line(aes( ) )+
  labs(color="What is fl")+
  
# customize linetype, color, size  
  scale_linetype_manual(values=c("twodash", "dotted","twodash", "dotted","twodash"))+
  scale_color_manual(name = "continents",breaks = c("c", "d", "e", "Europe", "Oceania"),labels = c("africa", "americas", "asia", "europe", "oceania"),  values=c('#999999','#E69F00','#999999','#E69F00','#999999')   )+ #using breaks define three labels   
  scale_size_manual(values=seq(1,4, 0.2))+
  theme(legend.position="top") +
  
  guides(color="legend")

4.5.9 Using scale_ function

all mappings have associated scales even if not specified.

uisng scale_colour_brewer.

ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy, colour = class)) + 
  scale_colour_brewer(type = 'qual')

RColorBrewer

RColorBrewer::display.brewer.all()

using different palettes

ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy, colour = class)) + 
  scale_colour_brewer (palette = 'Paired')

showing cyl with size

ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy, colour =  class, size=cyl)) + 
  scale_colour_brewer(palette  = 'Set1' ) +
  scale_size (breaks = c(4,6))

using guides to modify the scale_

 ggplot(mpg) +
   geom_point(aes(x = displ, y = hwy, colour = cyl, size=cyl)) +
   # scale_colour_brewer(palette  = 'Set1') +   #can not continuous 
   scale_size (breaks = c(4,5,6))  +
  
   guides( size = guide_legend(
            override.aes = list(color = c('red', 'blue', 'black'))))

unite legends

when multiple aesthetics are mapped to the same variable.

ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy, colour = cyl, size=cyl))   +
  guides(colour="legend")

category is also ok

ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy, colour = fl, size=fl))   +
  guides(colour="legend") #size is not ok

x and y also have associated scales

ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy)) + 
  scale_x_continuous(breaks = c(3.5, 5, 6)) + 
  scale_y_continuous(trans = 'log2')

4.5.10 Change coordinates

Changing the coordinate system can have dramatic effects

coord_polar

ggplot(mpg) + 
  geom_bar(aes(x = class)) + 
  coord_polar()

ggplot(mpg) + 
  geom_bar(aes(x = class)) + 
  coord_polar(theta = 'y') + 
  expand_limits(y = 70)

specify the scale of coordinate

require(scales)

## Loading required package: scales

## 
## Attaching package: 'scales'

## The following objects are masked from 'package:psych':
## 
##     alpha, rescale

## The following object is masked from 'package:plotrix':
## 
##     rescale

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

ggplot(mpg) + 
  geom_bar(aes(x = class)) + 
  scale_y_continuous(limits = c(0, 50), breaks = seq(0, 50, 01))

  # scale_y_continuous(labels = percent) # labels as percents
  # +
  # scale_x_discrete(labels=c(1:7)  )

using coord_cartesian zoom in

# have been deleted 
ggplot(mpg) + 
  geom_bar(aes(x = class)) + 
  scale_y_continuous(limits = c(0, 30))+
  scale_x_discrete( limit=c("midsize","compact"))

ggplot(mpg) + 
  geom_bar(aes(x = class)) + 
  coord_cartesian( ylim = c(0, 30))+
  scale_x_discrete( limit=c("midsize","compact"))

reverse direction of axes

ggplot(mpg) + 
  geom_point(aes(x = hwy, y = displ))+
  
  scale_x_continuous(breaks = c(20, 30, 35,40)) + 
  scale_y_reverse()+
  scale_y_continuous(trans="log2")

## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

# log10, sqrt, reverse, scale_y_continuous(trans="log2")

4.5.11 Customize axis ticks

change axis text font, color, size, angle using theme

# when use theme, scale, guide? 
ggplot(mpg) + 
  geom_point(aes(x = hwy, y = displ))+
  
  theme(axis.text.x = element_text(face="bold", color="#993333", 
                           size=14, angle=45),
          axis.text.y = element_text(face="bold", color="blue", 
                           size=7, angle=90))

remove aixs ticks and tick labels

ggplot(mpg) + 
  geom_point(aes(x = hwy, y = displ))+
theme(
  axis.text.x = element_blank(), # Remove x axis tick labels
  axis.text.y = element_blank(), # Remove y axis tick labels
  axis.ticks = element_blank()) # Remove ticks

4.5.12 Flip and reverse plot

boxplot and violin

ggplot(mpg ) + 
  geom_violin( aes(x = as.factor(cyl), y=hwy ,color=as.factor(cyl) ) ,trim = FALSE,width = 4 ) +
  geom_boxplot( aes(x = as.factor(cyl), y=hwy  ), notch = F , width = 0.1)

dotplot using stat_ function

ggplot(mpg ,aes(x = as.factor(cyl), y=hwy ) )+ 
  geom_dotplot(aes(color =as.factor(cyl), fill = as.factor(cyl)), 
                 binaxis = "y", stackdir = "center") + 
  stat_summary(fun.data="mean_sdl" )

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

errorbar

df3 <- data_summary(mpg, varname="hwy", grps= c("cyl" ))

## Loading required package: plyr

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following object is masked from 'package:ggpubr':
## 
##     mutate

## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise

## The following objects are masked from 'package:Hmisc':
## 
##     is.discrete, summarize

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:purrr':
## 
##     compact

head(df3)

cyl	hwy	sd
4	28.80247	4.515030
5	28.75000	0.500000
6	22.82278	3.685590
8	17.62857	3.262307

ggplot( df3, aes(as.factor(cyl) ,  (hwy), ymin = hwy-sd, ymax = hwy+sd)  ) +
  geom_line(aes(group = 0 )) +
  geom_errorbar(aes(color = as.factor(cyl) ),width = 0.2)

ggplot( df3, aes(as.factor(cyl) ,  (hwy), ymin = hwy-sd, ymax = hwy+sd)  ) +
  geom_bar(aes(fill = as.factor(cyl)), stat = "identity",
             position = "dodge") +
  geom_errorbar(aes(   ),width = 0.2) +
  coord_flip()

using original data

  ggplot(mpg, aes(cyl, hwy)) +
  stat_summary(geom = "bar") +
  stat_summary(geom = "errorbar")

## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`

flip

ggplot(mpg) + 
  geom_bar(aes(x = class)) +
  coord_flip()

reverse

ggplot(mpg) + 
  geom_bar(aes(x = class)) +
  scale_y_reverse()

4.5.13 Create stats

Every geom has a stat. The stat can be overwritten if we use any additional computations.

ggplot(mpg ) + 
  geom_bar (aes(x = cyl ),position = 'identity' ) #using original data

using transformed variables

library(dplyr)
library(ggplot2)
# mpg_counted <-    
#   count(mpg,  cyl  )
# head(mpg_counted)

# ggplot(mpg_counted) + 
#   geom_smooth(aes(x =  cyl , y = n)) +
#   geom_bar (aes(x =  cyl , y = n), stat = 'identity') #using summary data

using the after_stat() function inside aes().

require(scales)
ggplot(mpg) + 
  geom_bar(aes(x = class, y = after_stat(  count / sum(count))))+
  scale_y_continuous(labels = percent) # labels decimals as percents

using density geometric in histogram

ggplot(mpg,aes(x = hwy)) + 
  geom_histogram(aes(y=..density..))+
  geom_density(  )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Use stat_summary() to add the mean of hwy for each group

STAT vs. GEOM

p1 <- ggplot(mpg,aes(x = class, y = hwy) ) + 
  stat_summary(
    geom = "pointrange",
    fun.data = mean_se
  )

p2 <- ggplot(mpg,aes(x = class, y = hwy) ) + 
  stat_summary( 
  )

p3 <- ggplot(mpg,aes(x = class, y = hwy) ) + 
  stat_summary( 
  )+
  stat_summary(
    fun.data = ~mean_se(., mult = 1.96), # Increase `mult` value for bigger interval!
    geom = "errorbar",
  )

library(patchwork)
p1+p2+p3

## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`

ggplot(mpg) + 
  geom_point(aes(x = class, y = hwy), width = 0.2)+
  stat_summary(aes( x = class,y = hwy), geom="point",color="red",size=4)

## No summary function supplied, defaulting to `mean_se()`

jitter points

ggplot(mpg) + 
  geom_jitter(aes(x = class, y = hwy), width = 0.2)+
  stat_summary(aes( x = class,y = hwy), geom="point",color="red",size=4)

## No summary function supplied, defaulting to `mean_se()`

4.5.14 Facets

facet_wrap() allows you to place facet side by side into a rectangular layout. facet_grid() allows you to specify different directions and works on two variables.

share the axes between the different panels

ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy)) + 
  facet_wrap(~ class)

ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy)) + 
  facet_grid(drv~ .  )

ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy)) + 
  facet_grid(~ drv  )

ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy)) + 
  facet_grid(year ~ drv)

do not share the axes between the different panels

ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy)) + 
  facet_wrap(~ drv , scales = "free")

only free y axes

ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy)) + 
  facet_wrap(~ drv , scales = "free_y")

adjust y scale (space) between the panels

ggplot(mpg) + 
  geom_bar(aes(y = manufacturer)) + 
  facet_grid(class ~ .)

ggplot(mpg) + 
  geom_bar(aes(y = manufacturer)) + 
  facet_grid(class ~ ., space = "free_y", scales = "free_y")

display by adding multiple variables together

ggplot(mpg) + 
  geom_point(aes(x = displ, y = hwy)) + 
  facet_wrap(year ~ drv)

4.5.15 Theme

theme_minimal

ggplot(mpg) + 
  geom_bar(aes(y = class)) + 
  facet_wrap(~year) + 
  theme_minimal()

Further adjustments
theme_bw

ggplot(mpg) + 
  geom_bar(aes(y = class)) + 
  facet_wrap(~year) + 
  
  labs(title = "Number of car models per class",
       caption = "source: http://fueleconomy.gov",
       x = NULL,
       y = NULL) +
   
  theme_bw() + 
  theme(
 
    strip.text = element_text(face = 'bold', hjust = 0),
    
    plot.caption = element_text(face = 'italic'),
    
    panel.grid.major = element_line('white', size = 0.5),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank()
    # ,
    # panel.ontop = TRUE
  )

theme_classic

ggplot(mpg) + 
  geom_bar(aes(y = class, fill = drv)  ,position = "dodge") + 
  facet_wrap(~year) + 
  labs(title = "Number of car models per class",
       caption = "source: http://fueleconomy.gov",
       x = 'Number of cars',
       y = NULL)+
  
  scale_fill_manual(name = "Drive Models",values=c("black", "grey50",  "grey80"), labels = c("4w","Fw","Rw" )) +
  
  # scale_x_continuous(expand = c(0, NA)) + 
  
  theme_classic() + 
  theme(
    # text = element_text('Avenir Next Condensed'),
    # strip.text = element_text(face = 'bold', hjust = 0),
    
    plot.caption = element_text(face = 'italic'),
    
    panel.grid.major = element_line('white', size = 0.5),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank()
    
    # panel.ontop = TRUE)
  )

using ggthemes

library(ggthemes)
ggplot(mpg) + 
  geom_bar(aes(y = class, fill = drv)  ,position = "dodge") + 
  facet_wrap(~year) + 
  labs(title = "Number of car models per class",
       caption = "source: http://fueleconomy.gov",
       x = 'Number of cars',
       y = NULL)+
  
  ggthemes::theme_economist()

customized theme

ggplot(mpg) + 
  geom_bar(aes(y = class, fill = drv)  ,position = "dodge") + 
  facet_wrap(~year) + 
  labs(title = "Number of car models per class",
       caption = "source: http://fueleconomy.gov",
       x = 'Number of cars',
       y = NULL)+
  
theme(
  
    # 1 change legend
    legend.background = element_rect(
      fill = "#fff6c2",
      color = "black",
      linetype = "dashed"
    ),
    legend.key = element_rect(fill = "grey", color = "brown"),
    legend.position = "bottom", 
    
    # 2 change panel (middle erea) background 
    panel.background = element_rect(
      fill = "#005F59",
      color = "red", size = 3
    ),
    panel.border = element_rect(
      color = "black",
      fill = "transparent",
      linetype = "dashed", size = 3
    ),
    
    # 3 change plot background
    plot.background = element_rect(
      fill = "#a1dce9",
      color = "black",
      size = 1.3
    ),
    
   
    
    # 4 change axis elements
    axis.line = element_line(color = "orange", size = 2),
    axis.title = element_text(color = "red", face = "italic"),
    axis.ticks = element_line(color = "purple", size = 3),
    axis.text = element_text(color = "blue"),
    axis.text.x = element_text(angle = 45, hjust = 1), 
    
    # 5 change facet panel
    strip.background = element_rect(fill = "orange"),
    strip.text = element_text(color = "red"),
    panel.spacing = unit(0.3, "inch") 
  )

4.5.16 How to setup subscripts or superscripts

see here

4.6 How to create advanced plots