|
1. read csv file into R
- > WIOD_GVC <- read.table("E:/r_temp/csv_data/WIOD_result_intermediate_20180122.csv", header = TRUE, sep = ",")
复制代码
2. write R into csv file
- > write.csv(regression_data,file = "E:/r_temp/regression_data_20180130.csv")
复制代码
3. about the format of the table and the column
- > patents_table <- data.frame(patents_table)
- > patents_table$year <- as.character(patents_table$year)
- > patents_table$year <- as.factor(patents_table$year)
- > patents_table$patents <- as.numeric(patents_table$patents)
- # The format is quite important. Sometimes, the function doesn't work because of this.
复制代码
4. set and change the column’s name
- > colnames(airplane_DEA) <- c("year", "airplane_DEA")
- > names(GVC_CHN_4_hightech_ratio_convert)[names(GVC_CHN_4_hightech_ratio_convert)=="Var1"] <- "industries_index"
- #change column name "Var1" into "industries_index"
复制代码
5. find the column number whose name is “X2000_VT_middle_divide_uE_hat_AUS”
- > star_index <- match("X2000_VT_middle_divide_uE_hat_AUS", names(GVC_CHN_4_hightech))
- > star_index <- match(paste("X",year_i, "_VT_middle_divide_uE_hat_AUS",sep = ""), names(GVC_CHN_4_hightech))
- # table<GVC_CHN_4_hightech> is data source
复制代码
6. function of melt()
- change wide-format table into long-format table
- > library(reshape2) # for melt() into long-format, cast() into wide-format
- > GVC_CHN_4_hightech_VT_ratio_convert <- melt(GVC_CHN_4_hightech_VT_ratio, value.name = "new_name")
- aaa bbb ccc
- 1 2 3
- 1 2 3
- 1 2 3
- –> melt –>
- new_name
- aaa 1
- aaa 1
- aaa 1
- bbb 2
- bbb 2
- bbb 2
- ccc 3
- ccc 3
- ccc 3
- # dont's forget the "ID"
- > cr8_melt <- melt(cr8_pharmaceutical_PC_electronic_equipment_airplane, ID = "year")
- # The format of column"year" should be "factor" instead of "numeric". otherwise, the ID does not work at all.
复制代码
7. add(insert) a new column
- #you can use cbind(), but the new column is always last column in the table
- > library(tibble) # for add_column()
- > GVC_CHN_4_hightech_ratio_convert <- add_column(GVC_CHN_4_hightech_ratio_convert, col_nations, .after = 1)
- #insert a new column after the table<GVC_CHN_4_hightech_ratio_convert> column 1. The new column name is "col_nations", which has same rows as the table"GVC_CHN_4_hightech_ratio_convert"
- #alternatively
- > pharmaceutical_DEA_expense_labor$industries <- "pharmaceutical"
- #built up new column whose name is "industries".
- The whole column contents is "pharmaceutical".
复制代码
8. extract rows and columns by using subset() function
- > pharmaceutical_subset <- subset(high_tech_DEA_2in_2out, service.unit == "chemical_med_DEA" |service.unit == "traditional_med_DEA" |service.unit == "biological_med_DEA")
- # Extract the sub-table from the table"high_tech_DEA_2in_2out". The extracting condition is the rows must be "traditional_med_DEA" and "biological_med_DEA". Especially for rows
- # In addition, we can extract columns by using
- > capital_distance_44 <- capital_distance[,c("country_ISO","km_distance")]
复制代码
9. delete columns from the table
- > country_export_risk_44 <- country_export_risk[,-match("full.name", names(country_export_risk))]
复制代码
10. delete rows from the table (conditionally)
- > repeat{
- temp_CHN <- match("CHN", hightech_GVC_DEA_expense_labor_distance_risk_deleted[,"col_nations"] )
- if (is.na(temp_CHN)) {break}
- else {hightech_GVC_DEA_expense_labor_distance_risk_deleted <- hightech_GVC_DEA_expense_labor_distance_risk_deleted[-temp_CHN,]}
- }
复制代码
11. replace contents into new contents
- > for(year_i in (2000: 2014)) {
- country_export_risk_44$year <- gsub(paste("X",year_i,sep = ""), year_i, country_export_risk_44$year)
- }
-
- #replace the contents of column"year" in table<country_export_risk_44> with new contents.
- (X2000, X2001,...,X2014) --> (2000, 2001,...,2014)
复制代码
12. adding two columns together
- > vertical_trade <- regression_data$VS + regression_data$VS1
复制代码
13. aggregate value respectively
- > pharmaceutical_DEA <- aggregate(data=pharmaceutical_subset, DEA_2in_2out ~ year, mean, na.rm = TRUE)
- # The data source is "pharmaceutical_subset". calculate the mean of "DEA_2in_2out"(column name) for each of corresponding "year"(column name)
复制代码
14. merge two tables into one table
- > pharmaceutical_DEA_expense_labor <- merge(pharmaceutical_DEA, pharmaceutical_expense, by="year" )
- # The table"pharmaceutical_DEA" and table"pharmaceutical_expense" has same column "year". Merge these two table according to column"year"
- # the columns number of the table <pharmaceutical_DEA> and table<pharmaceutical_expense> can be different.
- > value_GVC_CHN_4_hightech_ratio_DEA_expense_labor <- merge(value_GVC_CHN_4_hightech_ratio_convert_industriesname, high_tech_DEA_expense_labor, by.x = c("col_year", "industries"), by.y =c("year", "industries"))
- # merge two table by the first table's "col_year" and "industries" and second table's "year" and "industries".
- > output_table_2 <- merge(liability_20160101_20180711, industry_1992_2018_stkcd, by.x = "a_stkcd" , by.y ="stkcd", all.x = TRUE) # according to table"liability_20160101_20180711" to merge
复制代码
15. fixed effect and random effect
- library(plm)
- # fixed effect (model = "within")
- > result_3a <- plm(VS_middle_divide_uE_hat ~ DEA + CR8 + km_distance + export_risk + GDP + CHN_GDP + patents, data = regression_data2, index = c("country"), model = "within")
- > summary(result_3a)
- > fixef(result_3a) # the fixed effects
- > pFtest(result_3a, result_3) # (<0.05, fixed effect is effective)
- # random effect (model = "random")
- > result_3b <- plm(VS_middle_divide_uE_hat ~ DEA + CR8 + km_distance + export_risk + GDP + CHN_GDP + patents, data = regression_data2, index = c("country"), model = "random")
- > summary(result_3b)
- # Hausman Test
- > phtest(result_3a, result_3b)
- # null hypothesis: the preferred model is random effects
- # alternative hypothesis: the preferred model is fixed effects (<0.05)
复制代码
16. extract “year” and “season” from “%Y-%m-%d”
- # assume date is in format of "%Y-%m-%d", we only want "year" information
- > bs_20160101_20180711$enddt <- as.Date(bs_20160101_20180711$enddt, "%Y-%m-%d")
- # creat new column of "year"
- > bs_20160101_20180711$year <- format(bs_20160101_20180711$enddt,"%Y")
- # creat new column of "season"
- > industry_20160101_20180711$season <- format(industry_20160101_20180711$infopubdt, "%m")
- > industry_20160101_20180711$season[industry_20160101_20180711$season == "01" | industry_20160101_20180711$season == "02" | industry_20160101_20180711$season == "03" ] <- "Q1"
- > industry_20160101_20180711$season[industry_20160101_20180711$season == "04" | industry_20160101_20180711$season == "05" | industry_20160101_20180711$season == "06" ] <- "Q2"
- > industry_20160101_20180711$season[industry_20160101_20180711$season == "07" | industry_20160101_20180711$season == "08" | industry_20160101_20180711$season == "09" ] <- "Q3"
- > industry_20160101_20180711$season[industry_20160101_20180711$season == "10" | industry_20160101_20180711$season == "11" | industry_20160101_20180711$season == "12" ] <- "Q4"
复制代码
17. if the contents are NA or empty, “0” are assigned
- >liability_20160101_20180711$debt_asset_ratio[is.na(liability_20160101_20180711$debt_asset_ratio)] <- 0
- > x <- c("a", NA, "c", "d", NA)
- > is.na(x) #result: [1] FALSE TRUE FALSE FALSE TRUE --> logical result
- > anyNA(x) #result: [1] TRUE, if there is no NA, then the result is "FALSE"
- > 1/0 # Inf: Inf is infinity. You can have either positive or negative infinity
- > -1/0 # -Inf
- > 0/0 # NaN: NaN means Not a Number. It’s an undefined value.
复制代码
18. omit all the rows which contain “NA”
- > output_table_4 <- na.omit(output_table_4) # omit all the rows which contain "NA"
复制代码
19. only extract rows that “csrcicnm2” is not “NA” or empty
- # assume "csrcicnm2" is one of the columns of output_table_1
- > output_table_1 <- subset(output_table_1, !is.na(csrcicnm2))
复制代码
20. want to know general table situation
- # data type
- > typeof(output_table_4) # result: "list"
- > typeof(output_table_4$industry_number) # result:"integer"
- > class(output_table_4) #result: "data.frame"
- > class(output_table_4$industry_number) #result: "integer"
- # about type of "factor"
- > x <- as.factor(output_table_4$industry_number) # transform data into factor
- > nlevels(x) # result: 19 factors, which is how many levels of factor
- > levels(x) #result: "1" "2" "3" "4" "5" "6" "7"....
- > y <- as.integer(x) #transform factor into integer
- # number of rows and columns
- > dim(output_table_4) # result: 30073 2, which means 30073 rows, 2 columns
- > nrow(output_table_4) # result: "30073", which means 30073 rows
- > ncol(output_table_4) # result: "2", which means 2 columns
- > length(output_table_4) # result: "2", which means 2 columns
- > length(output_table_4$industry_number) #result: "30073", which means 30073 rows
- # name of column and row
- > colnames(liability_20160101_20180711) # column name
- > attributes(output_table_4) # result: column names, row names, class
- # number of characters
- > nchar("Software Carpentry") # result: 18, number of characters
- # general situation of table
- > summary(output_table_2)
- > summary(output_table_2$debt_asset_ratio)
- # frequence for what happens
- > table(output_table_3$debt_asset_rank)
- > table(output_table_3$debt_asset_rank, useNA = "ifany") #tell us the number of "NA"
- > hist(output_table_2$debt_asset_ratio, seq(min(output_table_2$debt_asset_ratio), max(output_table_2$debt_asset_ratio)+0.05 ,by=0.05)) # the data frequency description by histogram
复制代码
21. R Version
- > R.version # check R version
复制代码
22. about A B C D…
- > industry_letter <- LETTERS[1:26] # A,B,C,D...
- > industry_letter <- letters[1:26] # a,b,c,d...
复制代码
23. conditional assign data
- > output_table_2$debt_asset_rank[output_table_2$debt_asset_ratio<0] <- rank_order
- When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
复制代码
|
|