R is a free software programming language and software environment for statistical computing and graphics. The R language is widely used among statisticians and data miners for developing statistical software and data analysis.
What can R do?
Using the R programming language, one can do:
- linear and non-linear modelling
- statistical tests
- Time-series analysis
- Classification
- Clustering
Why R?
Why use the R programming language? Because:
- R is free.
- It provides a powerful way to do statistical analysis on large sets of data.
- New functions and packages are created and updated consistently.
- It has a strong user base.
R Tutorial
# R basics cheat sheet, part 1: vectors, data frames, NA handling,
# which()/match()/%in%, and table manipulation with dplyr.
# Names such as df, column, column_name, col1..col4 and abbs are
# placeholders: substitute your own objects before running those lines.

# ---- Loading packages and data ----
library(dslabs)  # dslabs is a package (teaching data sets)
library(dplyr)   # mutate(), filter(), select() and %>% used further down

# General form: data(dataset_name) loads a packaged data set into R and
# str(dataset_name) gives the structure of the loaded data set.
data(murders)
str(murders)

# ---- Combination (concatenation) with c() ----
code <- c(380, 124, 818)
code
country <- c("italy", "canada", "egypt")
country

# names() gets or sets the names of a vector (column names of a data set)
names(code) <- country  # assign the country names to the elements of code
code

# ---- Basic functions ----
class(country)
length(country)

# ---- Numeric vs integer ----
num1 <- 3
num2 <- 3L  # the L suffix makes the literal an integer
class(num1)
class(num2)

# ---- Sequence generation ----
li1 <- seq(10, 20, 2)
li2 <- 20:10
li1
li2

# ---- Subsetting and slicing ----
code[2]
country[c(1, 3)]
country[2:3]

# ---- Type casting: change to numeric or integer with as.numeric() ----
x <- c("1", "2", 10)  # mixed input: every element is coerced to character
x
class(x)
as.numeric(x)  # returns a numeric copy
x              # x itself is unchanged

# ---- sort(): arranges in ascending order ----
temp <- c(2, 50, 56, 87, 101)
sort(temp)
temp

# ---- order(): returns the indexes of the ordered elements ----
ord <- c(34, 56, 38, 887, 101)
index <- order(ord)
index
class(index)
ord
ord[index]

# ---- max() and which.max() ----
# which.max() gives the index of the greatest element; min() and
# which.min() work the same way.
max(ord)
which.max(ord)
ord[which.max(ord)]

# ---- rank(): gives the rank from smallest to greatest ----
rank(ord)

# ---- Create a data frame ----
state <- c("UP", "Gujarat", "Bihar", "J&K")
pop <- c(10, 20, NA, 39)
df1 <- data.frame(states = state, population = pop)
df1

# ---- Check for NA values, i.e. missing values ----
ind <- is.na(df1$population)  # logical vector, TRUE where the value is NA
ind
sum(ind)                      # counts the NA values, as TRUE counts as 1
mean(df1$population[!ind])    # mean of the population values that are not NA

# ---- Logical operators (templates) ----
# ind1 <- df$column_name <= 7  # logical vector, like the is.na() result above
# df$column_name1[ind1]        # values from the data frame satisfying condition

# ---- which(), match() and %in% ----
# which() and match() return the index numbers of the matching elements;
# %in% returns a logical vector.
index <- which(df$column == "Brad")
df[index, ]  # rows where the condition holds
index <- match(c("NY", "Florida", "Texas"), df$column)
df[index, ]
x <- c("a", "b", "c", "d")
y <- c("a", "b", "e")
y %in% x  # logical output: is each element of y present in x?
# Checks whether the three items are in column_name; returns a logical vector
c("boston", "dakota", "washington") %in% df$column_name
# Indexes of the entries of abbs that are NOT valid state abbreviations
ind <- which(!abbs %in% murders$abb)

# ---- dplyr: manipulating data tables and advanced analysis ----
# mutate() adds columns. Template:
#   df1 <- mutate(df1, col4 = <expression>)  # adds column col4 to df1
# Example: add the rate and rank columns used by the filters below
# (rate here is murders per 100,000 people).
murders <- mutate(
  murders,
  rate = total / population * 100000,
  rank = rank(-rate)
)
head(df1)  # prints the first 6 rows of df1

# filter() keeps the rows satisfying the given condition
filter(df1, rate <= 0.7)

# select() keeps only the named columns; here col1 and col2 from df1
new_table <- select(df1, col1, col2)

# The pipe operator chains several steps into one expression
df1 %>%
  select(col2, col3, col4) %>%
  filter(col4 <= 0.7)

filter(murders, rate < 1 & (region == "Northeast" | region == "West")) %>%
  select(state, rate, rank)

# ---- Creating a data frame with character columns ----
# Before R 4.0 string columns became factors by default;
# stringsAsFactors = FALSE keeps them as character (the default since 4.0).
grades <- data.frame(
  name = c("aug", "july", "june"),
  exam = c(95, 96, 97),
  exam2 = c(10, 20, 30),
  stringsAsFactors = FALSE
)
class(grades$name)
filter(grades, name != "july")  # data frame without the row named july

# Rows whose region is Northeast or West
murders_nw <- filter(murders, region %in% c("Northeast", "West"))

# Another way of combining multiple conditions
filter(murders, population < 5000000 & region == "Northeast")
# R basics cheat sheet, part 2: ranking, base plots, control flow,
# functions, lists, matrices, merging and reading JSON.
# Names such as y, column_name, col1, col2, df2 and col_name are
# placeholders: substitute your own objects before running those lines.

library(dplyr)  # filter() below is dplyr's, not stats::filter()

# NOTE(review): assumes murders (dslabs) already has a rate column.
my_states <- filter(
  murders,
  rate < 1 & (region == "Northeast" | region == "West")
)

# ---- rank() ----
x <- c(88, 100, 83, 92, 94)
rank(x)   # rank of the elements from lowest to highest
rank(-x)  # negate the input to rank from highest to lowest

# ---- nrow() ----
nrow(df1)  # counts the number of rows

# ---- Plots ----
# These are built-in R plotting functions. The most popular plotting
# package is ggplot.
plot(x, y)         # makes a scatter plot
hist(column_name)  # histogram of one numeric vector
# Box plot of col1 stratified by col2
boxplot(col1 ~ col2, data = df1)

# ---- if / else ----
# Template:
#   if (<boolean expression>) {
#     <expression>
#   } else {
#     <expression>
#   }

# ---- ifelse(): condition, value if TRUE, value if FALSE, in one call ----
# NOTE(review): na_example is presumably the dslabs vector containing NAs.
no_nas <- ifelse(is.na(na_example), 0, na_example)
sum(is.na(no_nas))  # confirms there are no more NAs in no_nas

# ---- any() and all() ----
z <- c(TRUE, FALSE, FALSE)
any(z)  # TRUE: any() returns TRUE if at least one entry is TRUE
all(z)  # FALSE: all() returns TRUE only if every entry is TRUE

# ---- Defining functions ----
# avg() returns the arithmetic mean of x. A function can take several
# arguments, e.g. function(x, y, z). Objects created inside a function
# exist only during the call and are not saved in the workspace.
avg <- function(x) {
  s <- sum(x)
  n <- length(x)
  s / n
}

# ---- for loop ----
for (i in 1:5) {
  print(i)
}
# Functions used instead of for: apply, sapply, tapply, mapply

# ---- Defining a vector variable ----
a <- vector(length = 25)  # logical vector of 25 FALSE values

# ---- Creating a list ----
# A list can contain characters, numbers, vectors and matrices, i.e.
# elements of different data types.
list_name <- list("red", "green", c(1, 2, 3), TRUE, 51.23)
list_name <- list(
  c("JAN", "FEB", "MAR"),
  matrix(c(3, 4, 5, 1, -2, 5), nrow = 2)
)
# Give names to the elements of the list (a kind of column names)
names(list_name) <- c("Quarter", "A_Matrix")
unlist(list_name)  # converts the list object into a vector object

# ---- Creating a matrix ----
# Define the row and column names first; byrow is either TRUE or FALSE.
row_names <- c("row1", "row2")
col_names <- c("col1", "col2", "col3")
P <- matrix(
  c(3, 4, 5, 1, -2, 5),
  nrow = 2,
  byrow = TRUE,
  dimnames = list(row_names, col_names)
)
print(P[1, 3])  # element at 1st row, 3rd column
print(P[2, ])   # only the 2nd row
print(P[, 3])   # only the 3rd column

# ---- Working directory ----
getwd()  # gives the current working directory

# ---- Merge and join ----
merge(df1, df2, by = "col_name")                # inner join on the column
merge(df1, df2, by = "col_name", all.x = TRUE)  # left join
merge(df1, df2, by = "col_name", all.y = TRUE)  # right join
merge(df1, df2, by = "col_name", all = TRUE)    # full join

# ---- Other control flow ----
# R also has if, if/else, switch, repeat, while and for.
v <- "hello"
cnt <- 2
repeat {
  print(v)
  cnt <- cnt + 1
  if (cnt > 1) {
    break
  }
}

v <- "hello"
cnt <- 2
while (cnt < 7) {
  print(v)
  cnt <- cnt + 1  # without this increment the loop never terminates
}

# ---- Functions ----
# new.function() prints the squares of 1..a; seq_len() makes a == 0 print
# nothing instead of iterating over c(1, 0).
new.function <- function(a) {
  for (i in seq_len(a)) {
    b <- i^2
    print(b)
  }
}

# ---- Reading JSON files with the rjson package ----
install.packages("rjson")  # only needed once
library("rjson")           # loads the package into R after install
# Import a JSON file into an R object
new_file <- fromJSON(file = "file_name.json")
# R basics cheat sheet, part 3: reading and writing files, databases,
# summary statistics, models and decision trees.
# Names such as df_name, dataset_name, formula_used, model and data_set,
# and the file_name.* paths, are placeholders: substitute your own.

# ---- Reading a CSV file into R ----
# The file should be in the current working directory.
new_file <- read.csv("file_name.csv")

# subset() keeps the rows of a data frame that satisfy a condition on its
# columns. Template: data <- subset(df_name, <condition on col_name>)
data <- subset(df1, salary == max(salary))  # rows with the maximum salary
data <- subset(df1, salary > 600 & dept == "IT")

# Write a new CSV file into the current working directory
write.csv(df1, "output.csv", row.names = FALSE)

# ---- Reading XML files ----
library("XML")      # required library
library("methods")  # required library
new_file <- xmlParse(file = "file_name.xml")
rootnode <- xmlRoot(new_file)  # extracts the root node of the imported file
print(rootnode[1])             # prints the data from the first node
print(rootnode[[1]][[1]])      # first element of the first node
rootsize <- xmlSize(rootnode)  # number of nodes in the root
xmldf <- xmlToDataFrame("file_name.xml")  # converts the file to a data frame

# ---- RODBC: reading a database from R ----
install.packages("RODBC")  # only needed once
library("RODBC")
# odbcConnect() returns a connection handle; fill in uid and pwd.
SQL.df <- odbcConnect("sqlserverodbc", uid = "", pwd = "")

# ---- Mean ----
x <- c(1, 2.6, 17.5, -21, 54, 18, 4)
output <- mean(x)
# trim drops floor(n * trim) values from each end of the sorted series
# before averaging; with 7 values and trim = 0.3 that is 2 per end.
output <- mean(x, trim = 0.3)
# na.rm = TRUE removes all the NA values, then calculates the mean
output <- mean(x, na.rm = TRUE)

# ---- Median ----
x <- c(2, 5, 21.5, 9, 78)
output <- median(x)  # gives the median of the series

# ---- Mode ----
# There is no built-in function for the statistical mode, so a
# user-defined one is needed. data.mode() returns the most frequent value
# of v (the first one in case of ties).
data.mode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

# ---- Models ----
# Linear regression is fitted with lm(), which gives the coefficients.
# glm() fits generalized linear models, e.g. logistic regression.

# ---- Summary ----
print(summary(dataset_name))  # works for a data frame or a data set

# ---- Analysis of variance (ANOVA) ----
result <- aov(formula_used, data = dataset_name)
print(anova(result))

# ---- Chi-square test ----
# NOTE(review): chisq.test() is in the stats package; MASS is loaded here
# presumably only for its example data sets.
library("MASS")
print(chisq.test(dataset_name))

# ---- Multiple regression ----
# First fit the linear regression with lm(); this gives the coefficients
# (including the intercept). Individual coefficients can be printed with
# coef(). Here model is the fit returned by lm().
coef(model)[1]

# ---- Decision tree (uses the party library) ----
library("party")
input.dat <- data_set[c(1:110), ]  # copy the first 110 rows to a new frame
png(file = "dicision tree.png")    # gives the chart file a name
# Create the tree
output.tree <- ctree(
  nativespeaker ~ age * shoesize * score,
  data = input.dat
)
plot(output.tree)
# Close the device: this writes the PNG into the working directory
dev.off()
