设为首页收藏本站

EPS数据狗论坛

 找回密码
 立即注册

QQ登录

只需一步,快速开始

查看: 2142|回复: 0

R语言整理数据表格的关键程序语句

[复制链接]

5

主题

63

金钱

111

积分

入门用户

发表于 2018-7-26 11:23:46 | 显示全部楼层 |阅读模式
1. read csv file into R
  1. > WIOD_GVC <- read.table("E:/r_temp/csv_data/WIOD_result_intermediate_20180122.csv", header = TRUE, sep = ",")
复制代码

2. write R into csv file
  1. > write.csv(regression_data,file = "E:/r_temp/regression_data_20180130.csv")
复制代码

3. about the format of the table and the column
  1. > patents_table <- data.frame(patents_table)
  2. > patents_table$year <- as.character(patents_table$year)
  3. > patents_table$year <- as.factor(patents_table$year)
  4. > patents_table$patents <- as.numeric(patents_table$patents)

  5. # The format is quite important. Sometimes, the function doesn't work because of this.
复制代码

4. set and change the column’s name
  1. > colnames(airplane_DEA) <- c("year", "airplane_DEA")

  2. > names(GVC_CHN_4_hightech_ratio_convert)[names(GVC_CHN_4_hightech_ratio_convert)=="Var1"] <- "industries_index"

  3. #change column name "Var1" into "industries_index"
复制代码

5. find the column number whose name is “X2000_VT_middle_divide_uE_hat_AUS”
  1. > star_index <- match("X2000_VT_middle_divide_uE_hat_AUS", names(GVC_CHN_4_hightech))   
  2. > star_index <- match(paste("X",year_i, "_VT_middle_divide_uE_hat_AUS",sep = ""), names(GVC_CHN_4_hightech))
  3. # table<GVC_CHN_4_hightech> is data source
复制代码

6. function of melt()
  1. change wide-format table into long-format table   
  2. > library(reshape2)  # for melt() into long-format, cast() into wide-format   
  3. > GVC_CHN_4_hightech_VT_ratio_convert <- melt(GVC_CHN_4_hightech_VT_ratio, value.name = "new_name")
  4. aaa        bbb        ccc
  5. 1        2        3
  6. 1        2        3
  7. 1        2        3
  8. –> melt –>
  9.         new_name
  10. aaa        1
  11. aaa        1
  12. aaa        1
  13. bbb        2
  14. bbb        2
  15. bbb        2
  16. ccc        3
  17. ccc        3
  18. ccc        3
  19. # dont's forget the "ID"

  20. > cr8_melt <- melt(cr8_pharmaceutical_PC_electronic_equipment_airplane, ID = "year")

  21. # The format of column"year" should be "factor" instead of "numeric". otherwise, the ID does not work at all.  
复制代码

7. add(insert) a new column
  1. #you can use cbind(), but the new column is always last column in the table      
  2. > library(tibble) # for add_column()   
  3. > GVC_CHN_4_hightech_ratio_convert <- add_column(GVC_CHN_4_hightech_ratio_convert, col_nations, .after = 1)

  4. #insert a new column after the table<GVC_CHN_4_hightech_ratio_convert> column 1. The new column name is "col_nations", which has same rows as the table"GVC_CHN_4_hightech_ratio_convert"

  5. #alternatively

  6. > pharmaceutical_DEA_expense_labor$industries <- "pharmaceutical"   

  7. #built up new column whose name is "industries".
  8. The whole column contents is "pharmaceutical".
复制代码

8. extract rows and columns by using subset() function
  1. >  pharmaceutical_subset <- subset(high_tech_DEA_2in_2out, service.unit == "chemical_med_DEA" |service.unit == "traditional_med_DEA" |service.unit == "biological_med_DEA")  

  2. # Extract the sub-table from the table"high_tech_DEA_2in_2out". The extracting condition is the rows must be "traditional_med_DEA" and "biological_med_DEA". Especially for rows

  3. # In addition, we can extract columns by using        
  4. > capital_distance_44 <- capital_distance[,c("country_ISO","km_distance")]
复制代码

9. delete columns from the table
  1. > country_export_risk_44 <- country_export_risk[,-match("full.name", names(country_export_risk))]
复制代码

10. delete rows from the table (conditionally)
  1. >   repeat{
  2.     temp_CHN <- match("CHN", hightech_GVC_DEA_expense_labor_distance_risk_deleted[,"col_nations"] )
  3.     if (is.na(temp_CHN))  {break}
  4.     else {hightech_GVC_DEA_expense_labor_distance_risk_deleted <- hightech_GVC_DEA_expense_labor_distance_risk_deleted[-temp_CHN,]}
  5.   }
复制代码

11. replace contents into new contents
  1. > for(year_i in (2000: 2014)) {
  2.   country_export_risk_44$year <- gsub(paste("X",year_i,sep = ""), year_i, country_export_risk_44$year)
  3.   }
  4.   
  5. #replace the contents of column"year" in table<country_export_risk_44> with new contents.
  6. (X2000, X2001,...,X2014)  --> (2000, 2001,...,2014)
复制代码

12. adding two columns together
  1. > vertical_trade <- regression_data$VS + regression_data$VS1
复制代码

13. aggregate value respectively
  1. > pharmaceutical_DEA <- aggregate(data=pharmaceutical_subset, DEA_2in_2out ~ year, mean, na.rm = TRUE)

  2. # The data source is "pharmaceutical_subset". calculate the mean of "DEA_2in_2out"(column name) for each of corresponding "year"(column name)
复制代码

14. merge two tables into one table
  1. > pharmaceutical_DEA_expense_labor <- merge(pharmaceutical_DEA, pharmaceutical_expense, by="year" )

  2. # The table"pharmaceutical_DEA" and table"pharmaceutical_expense" has same column "year".  Merge these two table according to column"year"

  3. # the columns number of the table <pharmaceutical_DEA> and table<pharmaceutical_expense> can be different.

  4. > value_GVC_CHN_4_hightech_ratio_DEA_expense_labor <- merge(value_GVC_CHN_4_hightech_ratio_convert_industriesname, high_tech_DEA_expense_labor, by.x = c("col_year", "industries"), by.y =c("year", "industries"))

  5. # merge two table by the first table's "col_year" and "industries" and second table's "year" and "industries".

  6. > output_table_2 <- merge(liability_20160101_20180711, industry_1992_2018_stkcd,  by.x = "a_stkcd" , by.y ="stkcd", all.x = TRUE)   # according to table"liability_20160101_20180711" to merge
复制代码

15. fixed effect and random effect

  1. library(plm)

  2. # fixed effect (model = "within")
  3. > result_3a <- plm(VS_middle_divide_uE_hat ~ DEA + CR8  + km_distance + export_risk + GDP + CHN_GDP + patents, data = regression_data2, index = c("country"), model = "within")
  4. > summary(result_3a)
  5. > fixef(result_3a) # the fixed effects
  6. > pFtest(result_3a, result_3)  # (<0.05, fixed effect is effective)

  7. # random effect (model = "random")
  8. > result_3b <- plm(VS_middle_divide_uE_hat ~ DEA + CR8  + km_distance + export_risk + GDP + CHN_GDP + patents, data = regression_data2, index = c("country"), model = "random")
  9. > summary(result_3b)

  10. # Hausman Test
  11. > phtest(result_3a, result_3b)

  12. # null hypothesis: the preferred model is random effects
  13. # alternative hypothesis: the preferred model is fixed effects  (<0.05)
复制代码

16. extract “year” and “season” from “%Y-%m-%d”
  1. # assume date is in format of "%Y-%m-%d", we only want "year" information
  2. > bs_20160101_20180711$enddt <- as.Date(bs_20160101_20180711$enddt, "%Y-%m-%d")
  3. # creat new column of "year"
  4. > bs_20160101_20180711$year <- format(bs_20160101_20180711$enddt,"%Y")
  5. # creat new column of "season"
  6. > industry_20160101_20180711$season <- format(industry_20160101_20180711$infopubdt, "%m")
  7. > industry_20160101_20180711$season[industry_20160101_20180711$season == "01" | industry_20160101_20180711$season == "02" | industry_20160101_20180711$season == "03" ] <- "Q1"
  8. > industry_20160101_20180711$season[industry_20160101_20180711$season == "04" | industry_20160101_20180711$season == "05" | industry_20160101_20180711$season == "06" ] <- "Q2"
  9. > industry_20160101_20180711$season[industry_20160101_20180711$season == "07" | industry_20160101_20180711$season == "08" | industry_20160101_20180711$season == "09" ] <- "Q3"
  10. > industry_20160101_20180711$season[industry_20160101_20180711$season == "10" | industry_20160101_20180711$season == "11" | industry_20160101_20180711$season == "12" ] <- "Q4"
复制代码

17. if the contents are NA or empty, “0” are assigned
  1. >liability_20160101_20180711$debt_asset_ratio[is.na(liability_20160101_20180711$debt_asset_ratio)] <- 0

  2. > x <- c("a", NA, "c", "d", NA)
  3. > is.na(x)   #result: [1] FALSE  TRUE FALSE FALSE  TRUE  --> logical result
  4. > anyNA(x)   #result: [1] TRUE,  if there is no NA, then the result is  "FALSE"

  5. > 1/0  # Inf: Inf is infinity. You can have either positive or negative infinity
  6. > -1/0 # -Inf
  7. > 0/0  # NaN: NaN means Not a Number. It’s an undefined value.
复制代码

18. omit all the rows which contain “NA”
  1. > output_table_4 <- na.omit(output_table_4)  # omit all the rows which contain "NA"
复制代码

19. only extract rows that “csrcicnm2” is not “NA” or empty
  1. # assume "csrcicnm2" is one of the columns of output_table_1
  2. > output_table_1 <- subset(output_table_1, !is.na(csrcicnm2))
复制代码

20. want to know general table situation
  1. # data type
  2. > typeof(output_table_4)   # result: "list"
  3. > typeof(output_table_4$industry_number)   # result:"integer"

  4. > class(output_table_4)   #result: "data.frame"
  5. > class(output_table_4$industry_number)  #result: "integer"

  6. # about type of "factor"
  7. > x <- as.factor(output_table_4$industry_number) # transform data into factor
  8. > nlevels(x)  # result: 19 factors, which is how many levels of factor
  9. > levels(x) #result:  "1"  "2"  "3"  "4"  "5"  "6"  "7"....
  10. > y <- as.integer(x)  #transform factor into integer

  11. # number of rows and columns
  12. > dim(output_table_4) # result: 30073 2,  which means 30073 rows, 2 columns

  13. > nrow(output_table_4) # result: "30073", which means 30073 rows
  14. > ncol(output_table_4) # result: "2", which means 2 columns

  15. > length(output_table_4)  # result: "2", which means 2 columns
  16. > length(output_table_4$industry_number)  #result: "30073", which means 30073 rows

  17. # name of column and row
  18. > colnames(liability_20160101_20180711) # column name
  19. > attributes(output_table_4)  # result: column names, row names, class

  20. # number of characters
  21. > nchar("Software Carpentry")   # result: 18, number of characters

  22. # general situation of table
  23. > summary(output_table_2)
  24. > summary(output_table_2$debt_asset_ratio)

  25. # frequence for what happens
  26. > table(output_table_3$debt_asset_rank)
  27. > table(output_table_3$debt_asset_rank, useNA = "ifany")  #tell us the number of "NA"
  28. > hist(output_table_2$debt_asset_ratio, seq(min(output_table_2$debt_asset_ratio), max(output_table_2$debt_asset_ratio)+0.05 ,by=0.05))     # the data frequency description by histogram
复制代码

21. R Version
  1. > R.version   # check R version
复制代码

22. about A B C D…
  1. > industry_letter <- LETTERS[1:26]  # A,B,C,D...
  2. > industry_letter <- letters[1:26]  # a,b,c,d...
复制代码

23. conditional assign data
  1. > output_table_2$debt_asset_rank[output_table_2$debt_asset_ratio<0] <- rank_order
  2. When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
复制代码


您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

关闭

站长推荐上一条 /1 下一条

客服中心
关闭
在线时间:
周一~周五
8:30-17:30
QQ群:
653541906
联系电话:
010-85786021-8017
在线咨询
客服中心

意见反馈|网站地图|手机版|小黑屋|EPS数据狗论坛 ( 京ICP备09019565号-3 )   

Powered by BFIT! X3.4

© 2008-2028 BFIT Inc.

快速回复 返回顶部 返回列表