-
Notifications
You must be signed in to change notification settings - Fork 0
/
KEGG_API_FTICR_Mapping.R
56 lines (41 loc) · 1.99 KB
/
KEGG_API_FTICR_Mapping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
### Testing molecular formula to KEGG compound ID mapping using the KEGG REST API
library(readxl)
library(rvest)
library(dplyr)
# Maps the molecular formulas detected by FTICR-MS to KEGG compound IDs.
# For every unique, non-missing formula in the input CSV the KEGG REST
# "find compound by formula" endpoint is queried; the rows whose formula
# matches the detected one exactly (ignoring case) are kept and written to
# "<output_prefix>_KEGG_CPD_FTICR_Matching.txt" as a tab-separated table with
# the columns CPD, KEGG_MF and Detected_MF.
output_prefix = "Output"

# Set working directory
setwd("~/Documents/East River/New Comparative Analyses/NonSPE_Neg/")

# Load in molecular file from ftmsRanalysis and keep one row per
# non-missing molecular formula
mol = read.csv("Processed_NonSPE_Neg_NoP_Mol.csv")
mol = mol[!is.na(mol$MolForm), ]
mol = mol[!duplicated(mol$MolForm), ]

# Requests occasionally time out, so each one is attempted a few times
# before the script gives up
max.tries = 3

# One slot per formula; filled inside the loop and combined once at the end
# (avoids re-copying a growing data frame on every iteration). If the loop
# aborts, the results gathered so far remain in mf.results.
mf.results = vector("list", nrow(mol))

# Loop through all of the formulas and use the REST API.
# seq_len() makes this a no-op when no formulas are left after filtering.
for (i in seq_len(nrow(mol))) {
  mf = mol$MolForm[i] # Current molecular formula

  # Build the query URL; spaces must be escaped to form a valid URL
  url = paste0("http://rest.kegg.jp/find/compound/", mf, "/formula")
  url = gsub(" ", "%20", url, fixed = TRUE)

  # Load the response, retrying on errors (e.g. time-outs)
  for (attempt in seq_len(max.tries)) {
    page = tryCatch(read_html(url), error = function(e) e)
    if (!inherits(page, "error")) {
      break
    }
    if (attempt == max.tries) {
      stop("KEGG request failed for ", mf, " (entry ", i, ") after ",
           max.tries, " attempts: ", conditionMessage(page), call. = FALSE)
    }
    Sys.sleep(attempt) # Short, increasing pause before the next attempt
  }

  # Placeholder row used whenever no exact match is found
  hit = data.frame(CPD = NA, KEGG_MF = NA, Detected_MF = mf)

  # NOTE(review): a length-1 object is treated as "nothing returned";
  # presumably this depends on the installed xml2 version - confirm
  if (length(page) != 1) {
    txt = page %>% html_node("body") %>% html_children() %>% html_text()
    txt = gsub("'", "", txt) # Apostrophes would be read as quotes by read.table
    found = read.table(text = txt, sep = "\t", header = FALSE)

    # The search may return entries whose formula is not identical to the
    # query, so only exact (case-insensitive) matches are reported
    mf.loc = which(tolower(found$V2) == tolower(mf))
    if (length(mf.loc) > 0) {
      hit = data.frame(CPD = found$V1[mf.loc], KEGG_MF = found$V2[mf.loc],
                       Detected_MF = mf)
    }
  }

  mf.results[[i]] = hit
  print(paste("Just finished with", mf, "which is entry", i))
}

# Combine the per-formula results into a single table
mf.table = do.call(rbind, mf.results)
if (is.null(mf.table)) {
  # No formulas were processed; still write a header-only file
  mf.table = data.frame(CPD = character(0), KEGG_MF = character(0),
                        Detected_MF = character(0))
}

# Dropping formulas without a KEGG match
mf.table = mf.table[!is.na(mf.table$CPD), ]

# Writing results out
write.table(mf.table, paste0(output_prefix, "_KEGG_CPD_FTICR_Matching.txt"),
            sep = "\t", quote = FALSE, row.names = FALSE)