R语言整理数据表格的关键程序语句

19881103liujia · 发表于 2018-7-26 11:23:46

1. read csv file into R

> WIOD_GVC <- read.table("E:/r_temp/csv_data/WIOD_result_intermediate_20180122.csv", header = TRUE, sep = ",")

复制代码

2. write R into csv file

> write.csv(regression_data,file = "E:/r_temp/regression_data_20180130.csv")

复制代码

3. about the format of the table and the column

> patents_table <- data.frame(patents_table)
> patents_table$year <- as.character(patents_table$year)
> patents_table$year <- as.factor(patents_table$year)
> patents_table$patents <- as.numeric(patents_table$patents)
# The format is quite important. Sometimes, the function doesn't work because of this.

复制代码

4. set and change the column’s name

> colnames(airplane_DEA) <- c("year", "airplane_DEA")
> names(GVC_CHN_4_hightech_ratio_convert)[names(GVC_CHN_4_hightech_ratio_convert)=="Var1"] <- "industries_index"
#change column name "Var1" into "industries_index"

复制代码

5. find the column number whose name is “X2000_VT_middle_divide_uE_hat_AUS”

> star_index <- match("X2000_VT_middle_divide_uE_hat_AUS", names(GVC_CHN_4_hightech))
> star_index <- match(paste("X",year_i, "_VT_middle_divide_uE_hat_AUS",sep = ""), names(GVC_CHN_4_hightech))
# table<GVC_CHN_4_hightech> is data source

复制代码

6. function of melt()

change wide-format table into long-format table
> library(reshape2) # for melt() into long-format, cast() into wide-format
> GVC_CHN_4_hightech_VT_ratio_convert <- melt(GVC_CHN_4_hightech_VT_ratio, value.name = "new_name")
aaa bbb ccc
1 2 3
1 2 3
1 2 3
–> melt –>
new_name
aaa 1
aaa 1
aaa 1
bbb 2
bbb 2
bbb 2
ccc 3
ccc 3
ccc 3
# dont's forget the "ID"
> cr8_melt <- melt(cr8_pharmaceutical_PC_electronic_equipment_airplane, ID = "year")
# The format of column"year" should be "factor" instead of "numeric". otherwise, the ID does not work at all.

复制代码

7. add(insert) a new column

#you can use cbind(), but the new column is always last column in the table
> library(tibble) # for add_column()
> GVC_CHN_4_hightech_ratio_convert <- add_column(GVC_CHN_4_hightech_ratio_convert, col_nations, .after = 1)
#insert a new column after the table<GVC_CHN_4_hightech_ratio_convert> column 1. The new column name is "col_nations", which has same rows as the table"GVC_CHN_4_hightech_ratio_convert"
#alternatively
> pharmaceutical_DEA_expense_labor$industries <- "pharmaceutical"
#built up new column whose name is "industries".
The whole column contents is "pharmaceutical".

复制代码

8. extract rows and columns by using subset() function

> pharmaceutical_subset <- subset(high_tech_DEA_2in_2out, service.unit == "chemical_med_DEA" |service.unit == "traditional_med_DEA" |service.unit == "biological_med_DEA")
# Extract the sub-table from the table"high_tech_DEA_2in_2out". The extracting condition is the rows must be "traditional_med_DEA" and "biological_med_DEA". Especially for rows
# In addition, we can extract columns by using
> capital_distance_44 <- capital_distance[,c("country_ISO","km_distance")]

复制代码

9. delete columns from the table

> country_export_risk_44 <- country_export_risk[,-match("full.name", names(country_export_risk))]

复制代码

10. delete rows from the table (conditionally)

> repeat{
temp_CHN <- match("CHN", hightech_GVC_DEA_expense_labor_distance_risk_deleted[,"col_nations"] )
if (is.na(temp_CHN)) {break}
else {hightech_GVC_DEA_expense_labor_distance_risk_deleted <- hightech_GVC_DEA_expense_labor_distance_risk_deleted[-temp_CHN,]}
}

复制代码

11. replace contents into new contents

> for(year_i in (2000: 2014)) {
country_export_risk_44$year <- gsub(paste("X",year_i,sep = ""), year_i, country_export_risk_44$year)
}
#replace the contents of column"year" in table<country_export_risk_44> with new contents.
(X2000, X2001,...,X2014) --> (2000, 2001,...,2014)

复制代码

12. adding two columns together

> vertical_trade <- regression_data$VS + regression_data$VS1

复制代码

13. aggregate value respectively

> pharmaceutical_DEA <- aggregate(data=pharmaceutical_subset, DEA_2in_2out ~ year, mean, na.rm = TRUE)
# The data source is "pharmaceutical_subset". calculate the mean of "DEA_2in_2out"(column name) for each of corresponding "year"(column name)

复制代码

14. merge two tables into one table

> pharmaceutical_DEA_expense_labor <- merge(pharmaceutical_DEA, pharmaceutical_expense, by="year" )
# The table"pharmaceutical_DEA" and table"pharmaceutical_expense" has same column "year". Merge these two table according to column"year"
# the columns number of the table <pharmaceutical_DEA> and table<pharmaceutical_expense> can be different.
> value_GVC_CHN_4_hightech_ratio_DEA_expense_labor <- merge(value_GVC_CHN_4_hightech_ratio_convert_industriesname, high_tech_DEA_expense_labor, by.x = c("col_year", "industries"), by.y =c("year", "industries"))
# merge two table by the first table's "col_year" and "industries" and second table's "year" and "industries".
> output_table_2 <- merge(liability_20160101_20180711, industry_1992_2018_stkcd, by.x = "a_stkcd" , by.y ="stkcd", all.x = TRUE) # according to table"liability_20160101_20180711" to merge

复制代码

15. fixed effect and random effect

library(plm)
# fixed effect (model = "within")
> result_3a <- plm(VS_middle_divide_uE_hat ~ DEA + CR8 + km_distance + export_risk + GDP + CHN_GDP + patents, data = regression_data2, index = c("country"), model = "within")
> summary(result_3a)
> fixef(result_3a) # the fixed effects
> pFtest(result_3a, result_3) # (<0.05, fixed effect is effective)
# random effect (model = "random")
> result_3b <- plm(VS_middle_divide_uE_hat ~ DEA + CR8 + km_distance + export_risk + GDP + CHN_GDP + patents, data = regression_data2, index = c("country"), model = "random")
> summary(result_3b)
# Hausman Test
> phtest(result_3a, result_3b)
# null hypothesis: the preferred model is random effects
# alternative hypothesis: the preferred model is fixed effects (<0.05)

复制代码

16. extract “year” and “season” from “%Y-%m-%d”

# assume date is in format of "%Y-%m-%d", we only want "year" information
> bs_20160101_20180711$enddt <- as.Date(bs_20160101_20180711$enddt, "%Y-%m-%d")
# creat new column of "year"
> bs_20160101_20180711$year <- format(bs_20160101_20180711$enddt,"%Y")
# creat new column of "season"
> industry_20160101_20180711$season <- format(industry_20160101_20180711$infopubdt, "%m")
> industry_20160101_20180711$season[industry_20160101_20180711$season == "01" | industry_20160101_20180711$season == "02" | industry_20160101_20180711$season == "03" ] <- "Q1"
> industry_20160101_20180711$season[industry_20160101_20180711$season == "04" | industry_20160101_20180711$season == "05" | industry_20160101_20180711$season == "06" ] <- "Q2"
> industry_20160101_20180711$season[industry_20160101_20180711$season == "07" | industry_20160101_20180711$season == "08" | industry_20160101_20180711$season == "09" ] <- "Q3"
> industry_20160101_20180711$season[industry_20160101_20180711$season == "10" | industry_20160101_20180711$season == "11" | industry_20160101_20180711$season == "12" ] <- "Q4"

复制代码

17. if the contents are NA or empty, “0” are assigned

>liability_20160101_20180711$debt_asset_ratio[is.na(liability_20160101_20180711$debt_asset_ratio)] <- 0
> x <- c("a", NA, "c", "d", NA)
> is.na(x) #result: [1] FALSE TRUE FALSE FALSE TRUE --> logical result
> anyNA(x) #result: [1] TRUE, if there is no NA, then the result is "FALSE"
> 1/0 # Inf: Inf is infinity. You can have either positive or negative infinity
> -1/0 # -Inf
> 0/0 # NaN: NaN means Not a Number. It’s an undefined value.

复制代码

18. omit all the rows which contain “NA”

> output_table_4 <- na.omit(output_table_4) # omit all the rows which contain "NA"

复制代码

19. only extract rows that “csrcicnm2” is not “NA” or empty

# assume "csrcicnm2" is one of the columns of output_table_1
> output_table_1 <- subset(output_table_1, !is.na(csrcicnm2))

复制代码

20. want to know general table situation

# data type
> typeof(output_table_4) # result: "list"
> typeof(output_table_4$industry_number) # result:"integer"
> class(output_table_4) #result: "data.frame"
> class(output_table_4$industry_number) #result: "integer"
# about type of "factor"
> x <- as.factor(output_table_4$industry_number) # transform data into factor
> nlevels(x) # result: 19 factors, which is how many levels of factor
> levels(x) #result: "1" "2" "3" "4" "5" "6" "7"....
> y <- as.integer(x) #transform factor into integer
# number of rows and columns
> dim(output_table_4) # result: 30073 2, which means 30073 rows, 2 columns
> nrow(output_table_4) # result: "30073", which means 30073 rows
> ncol(output_table_4) # result: "2", which means 2 columns
> length(output_table_4) # result: "2", which means 2 columns
> length(output_table_4$industry_number) #result: "30073", which means 30073 rows
# name of column and row
> colnames(liability_20160101_20180711) # column name
> attributes(output_table_4) # result: column names, row names, class
# number of characters
> nchar("Software Carpentry") # result: 18, number of characters
# general situation of table
> summary(output_table_2)
> summary(output_table_2$debt_asset_ratio)
# frequence for what happens
> table(output_table_3$debt_asset_rank)
> table(output_table_3$debt_asset_rank, useNA = "ifany") #tell us the number of "NA"
> hist(output_table_2$debt_asset_ratio, seq(min(output_table_2$debt_asset_ratio), max(output_table_2$debt_asset_ratio)+0.05 ,by=0.05)) # the data frequency description by histogram

复制代码

21. R Version

> R.version # check R version

复制代码

22. about A B C D…

> industry_letter <- LETTERS[1:26] # A,B,C,D...
> industry_letter <- letters[1:26] # a,b,c,d...

复制代码

23. conditional assign data

> output_table_2$debt_asset_rank[output_table_2$debt_asset_ratio<0] <- rank_order
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).

复制代码

		自动登录	找回密码
密码			立即注册

R语言整理数据表格的关键程序语句

站长推荐 /1