Table of Contents
R Tutorial – R Basic Syntax R Overview
R is a free software programming language and software environment for statistical computing and graphics. The R language is widely used among statisticians and data miners for developing statistical software and data analysis.
What can R do?
Using the R programming language, one can do:
- linear and non-linear modelling
- statistical tests
- Time-series analysis
- Classification
- Clustering
Why R?
Why use the R programming language? Because:
- R is free.
- It provides a powerful way to do statistical analysis on large sets of data.
- New functions and packages are created and updated consistently.
- It has a strong user base.
R Tutorial
# Load a data set that ships with a package (file_name is a placeholder for the data set's name)
library(dslabs) # attaches the dslabs package so its data sets can be found
data(file_name)
# data() makes the named data set available as an object in the workspace
str(file_name) # shows the structure of the loaded data set
# c() combines (concatenates) its arguments into a vector
code <- c(380, 124, 818)
code
country <- c('italy', 'canada', 'egypt')
country
# names() reads or sets the element names of a vector (for a data frame it gives the column names)
names(code) <- country # attaches the country names to the elements of code
code
# basic functions
class(country) # "character"
length(country) # 3
# numeric vs integer
num1 <- 3
num2 <- 3L # the L suffix makes the literal an integer
class(num1) # "numeric"
class(num2) # "integer"
# sequence generation
li1 <- seq(10,20,2) # from 10 to 20 in steps of 2
li2 <- 20:10 # counts down from 20 to 10
li1
li2
# subsetting and slicing
code[2] # second element
country[c(1,3)] # first and third elements
country[2:3] # second through third elements
# type casting: as.numeric() / as.integer() convert a vector to numeric or integer
x <- c('1','2', 10) # mixing strings and a number makes c() coerce everything to character
x
class(x) # "character"
as.numeric(x) # returns a new numeric vector; the result is not assigned, so x is unchanged
x
# sorting: sort() returns the values in ascending order
temp <- c(2,50,56,87,101)
sort(temp)
temp # sort() did not modify temp itself
# order() returns the indexes that would put the elements in ascending order
ord <- c(34,56,38,887,101)
index <- order(ord)
index
class(index) # "integer"
ord
ord[index] # same result as sort(ord)
# max() and which.max(): which.max() gives the index of the largest element (min() and which.min() work the same way)
max(ord)
which.max(ord)
ord[which.max(ord)] # same value as max(ord)
# rank() gives each element's rank, from smallest (1) to largest
rank(ord)
# create a data frame from two equal-length vectors
state <- c('UP','Gujarat','Bihar','J&K')
pop <- c(10,20,NA,39)
df1 <- data.frame(states = state, population = pop)
df1
# check for NA (missing) values
ind <- is.na(df1) # logical matrix with the same shape as df1; TRUE marks an NA cell
ind
sum(ind) # counts the NA values, because TRUE counts as 1 when summed
# df1[!ind] would mix the state names in with the numbers (giving a character
# vector whose mean is NA), so average the numeric column directly and drop its NAs
mean(df1$population, na.rm = TRUE)
# logical operators
# ind1 <- df$column_name <= 7 # gives a logical vector, similar to is.na
# df$column_name1[ind1] # gives the values of column_name1 where the condition holds
# which(), match() and %in%: which() and match() return index numbers, %in% returns a logical vector
# (df, abbs and murders are placeholders for data you have already loaded)
index <- which(df$column == "Brad") # positions where the condition is TRUE
df[index, ] # row indexing needs the comma; df[index] would select columns instead
index <- match(c("NY","Florida","Texas"), df$column) # position of the first match for each value, NA if absent
df[index, ]
x <- c("a","b","c","d")
y <- c("a","b","e")
y %in% x # TRUE TRUE FALSE: is each element of y present in x?
c("boston","dakota","washington") %in% df$column_name # checks whether each of the three items is in column_name
ind <- which(!abbs %in% murders$abb) # positions of the abbs entries that are NOT in murders$abb
# Manipulating data tables: mutate(), filter(), select() and %>% come from the 'dplyr' package,
# which has to be loaded with library(dplyr) first. Column names below are placeholders.
df1 <- mutate(df1,col4=........) # adds column col4 to df1; replace the dots with the expression that computes it
head(df1) # shows the first 6 rows of df1
filter(df1, rate <=0.7) # filter() keeps only the rows of df1 that satisfy the condition
new_table <- select(df1, col1,col2) # select() keeps only col1 and col2; the result is stored as new_table
df1 %>% select(col2, col3, col4) %>% filter(col4 <= 0.7) # the pipe passes each result on to the next step
filter(murders, rate < 1 & (region == 'Northeast' | region == 'West')) %>% select(state, rate, rank) # filter rows, then keep three columns
# creating a data frame
grades <- data.frame(name = c("aug","july","june"),
                     exam = c(95, 96, 97),
                     exam2 = c(10, 20, 30),
                     stringsAsFactors = FALSE) # keep name as character instead of converting it to a factor
# Argument names are case-sensitive: a misspelt 'stringsasfactors' would be added
# as an extra column. Before R 4.0 character columns became factors by default;
# since R 4.0 stringsAsFactors = FALSE is already the default.
class(grades$name) # "character"
filter(grades, name != 'july') # shows the data frame without the row whose name is july
murders_nw <- filter(murders, region %in% c("Northeast", "West")) # keeps the rows whose region is Northeast or West
filter(murders, population < 5000000 & region == "Northeast") # combining two conditions with &
my_states <- filter(murders, rate < 1 & (region == 'Northeast' | region == 'West')) # & and | combined; parentheses group the | part
# rank function
x <- c(88, 100, 83, 92, 94)
rank(x) # rank of each element, lowest value = 1
rank(-x) # negate the values to rank from highest to lowest instead
# nrow()
nrow(df1) # counts number of rows
# Plots. These are built-in R plotting functions. The most popular plotting package is ggplot2.
plot(x, y) # makes a scatter plot of y against x (both numeric vectors of the same length)
hist(column_name) # histogram of a numeric vector
boxplot(col1~col2, data = df1) # box plots of col1, one box per level of col2 (col2 is the stratifying variable)
# if / else statement (a conditional, not a loop) -- template, replace the placeholders
if(boolean expression){
expression
} else{
expression
}
# ifelse: vectorised if/else that works element by element
no_nas <- ifelse(is.na(na_example), 0, na_example) # arguments: condition, value where TRUE, value where FALSE
sum(is.na(no_nas)) # confirms there are no more NAs in no_nas object
# any() and all()
z <- c(TRUE, FALSE, FALSE) # logical constants must be upper case; lower-case true/false are undefined names
any(z) # TRUE: any() returns TRUE if at least one entry is TRUE
all(z) # FALSE: all() returns TRUE only if every entry is TRUE
# defining functions
# avg(x): arithmetic mean of x, i.e. the sum of its elements divided by their count.
# A function may take several arguments, e.g. function(x, y, z). Anything created
# inside the body exists only while the call runs and is not left in the workspace.
avg <- function(x) {
  sum(x) / length(x)
}
# for loop: runs the body once for each value of i
for(i in 1:5){
print(i)
}
# functions that can be used instead of a for loop - apply, sapply, tapply, mapply
# creating an empty vector of a given length
a <- vector(length = 25) # the default mode is "logical", so this is 25 FALSE values
# creating a LIST: its elements may have different types (characters, numbers, vectors, matrices)
list_name <- list("red","green", c(1,2,3), TRUE, 51.23)
list_name <- list(c("JAN","FEB","MAR"), matrix(c(3,4,5,1,-2,5), nrow=2)) # replaces the list created on the previous line
names(list_name) <- c("Quarter", "A_Matrix") # names the elements of list_name
unlist(list_name) # unlist() flattens the list into a single vector
# creating a MATRIX. Row and column names are character vectors, passed together as a list in dimnames.
row_names <- c("row1", "row2")
col_names <- c("col1", "col2", "col3")
# byrow = TRUE fills the matrix row by row; byrow = FALSE (the default) fills it column by column
P <- matrix(c(3,4,5,1,-2,5), nrow = 2, byrow = TRUE, dimnames = list(row_names, col_names))
print(P[1,3]) # Access the element at 3rd column and 1st row
print(P[2,]) # Access only the 2nd row
print(P[,3]) # Access only the 3rd column
# Working directory
getwd() # gives the current working directory
# Merge and Join (df1 and df2 are data frames that share the column "col_name")
merge(df1, df2, by = "col_name") # (INNER JOIN) merges df1 and df2 on the given column
merge(df1, df2, by = "col_name", all.x = TRUE) # (Left Join) keeps every row of df1
merge(df1, df2, by = "col_name", all.y = TRUE) # (Right Join) keeps every row of df2
merge(df1, df2, by = "col_name", all = TRUE) # (Full Join) keeps every row of both
# R also has IF, IF ELSE, SWITCH, REPEAT, WHILE, FOR
# repeat runs its body until break is reached
v <- "hello"
cnt <- 2
repeat {
  print(v)
  cnt <- cnt + 1
  if (cnt > 1) { # already true after the first pass, so "hello" is printed once
    break
  }
}
# while repeats its body for as long as the condition is TRUE
v <- "hello"
cnt <- 2
while (cnt < 7) {
  print(v)
  cnt <- cnt + 1 # without this increment the condition never changes and the loop never ends
}
# Functions
# new.function(a): prints the squares of 1, 2, ..., a, one per line.
# a is expected to be a whole number; for a < 1 nothing is printed.
new.function <- function(a) {
  # seq_len() is empty for a count of 0, whereas 1:a would count down over 1 and 0
  for (i in seq_len(max(a, 0))) {
    b <- i^2
    print(b)
  }
}
# R can read JSON files using the rjson package
install.packages("rjson") # one-time installation; not needed again once the package is installed
library("rjson") # loads the package into R after install
new_file <- fromJSON(file = "file_name.json") # reads the JSON file into an R object
# reading a CSV file into R
new_file <- read.csv("file_name.csv") # this file should be in the current working directory
# template: data <- subset(df_name, <condition on columns>) keeps the rows of df_name that satisfy the condition
data <- subset(df1, salary == max(salary)) # rows of df1 that have the maximum salary
data <- subset(df1, salary > 600 & dept == "IT") # rows with salary above 600 in the IT department
write.csv(df1, "output.csv", row.names = FALSE) # writes df1 to a new CSV file in the current working directory
# to read XML files
library("XML") # required library
library("methods") # required library
new_file <- xmlParse(file = "file_name.xml")
rootnode <- xmlRoot(new_file) # extracts the root node from imported xml file
print(rootnode[1]) # prints the data from first node
print(rootnode[[1]][[1]]) # first element of first node
rootsize <- xmlSize(rootnode) # number of child nodes under the root (measured on rootnode, not on the document)
xmldf <- xmlToDataFrame("file_name.xml") # converts xml file into dataframe
# RODBC is used to connect to a database from R
install.packages("RODBC") # one-time installation
library("RODBC")
# NOTE(review): odbcConnect() presumably returns a connection handle rather than a data frame, despite the name SQL.df - confirm
SQL.df <- odbcConnect("sqlserverodbc", uid="", pwd="")
# MEAN
x <- c(1,2.6,17.5,-21,54,18,4)
output <- mean(x)
output <- mean(x, trim = 0.3) # drops the lowest and highest 30% of the sorted values (here 2 from each end) before averaging
output <- mean(x, na.rm = TRUE) # removes all the NA values from the series and then calculates the mean
# MEDIAN
x <- c(2,5,21.5,9,78)
output <- median(x) # gives the median of the series
# MODE. There is no built-in function for the statistical mode, so define one.
# data.mode(v): returns the most frequent value in v; on a tie, the value that appears first in v wins.
data.mode <- function(v) {
  uniqv <- unique(v) # distinct values, in order of first appearance
  # match() maps every element to its position in uniqv, tabulate() counts each
  # position, and which.max() picks the first position with the highest count
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
# Linear regression can be done with the lm() function (it gives the coefficients)
# glm() fits generalized linear models, such as logistic regression
# Summary (df_name stands for any data frame or data set)
print(summary(df_name))
# Analysis of variance (ANOVA)
result <- aov(formula_used, data = dataset_name)
print(anova(result))
# CHI SQUARE test. chisq.test() is part of base R's stats package;
# NOTE(review): MASS is presumably loaded only for its example data sets - confirm
library("MASS")
print(chisq.test(dataset_name))
# Multiple Regression
# - first fit the regression with lm(); this gives the coefficients (including the intercept)
# - the intercept can be printed individually using coef()
coef(model)[1] # model is the linear regression fitted with lm()
# Decision Tree. Uses the 'party' library
library("party")
input.dat <- data_set[c(1:110),] # copy the first 110 rows into a new data frame
png(file = "dicision tree.png") # opens a PNG graphics device with this file name
# create the tree
output.tree <- ctree(
  nativespeaker ~ age * shoesize * score,
  data = input.dat)
plot(output.tree) # drawn into the PNG device opened above
dev.off() # closes the device, which writes the PNG file into the working directory



Tremendous things here. I am very glad to look your article.
Thanks so much and I am looking forward to contact you.
Will you please drop me a mail?
Hi there colleagues, how is all, and what you want to say regarding this paragraph, in my view its truly amazing for me.
Do you have any video of that? I’d love to find out more details.