基于JavaScript爬取法律文书案由信息

传送:R语言中使用JavaScript

中国裁判文书网:http://wenshu.court.gov.cn/Assets/js/Lawyee.CPWSW.DictData.js

library("RCurl")
library("js")
library("dplyr")
# Scrape the case-cause dictionary that China Judgements Online ships as a
# JavaScript file, and flatten it into the data frame `case_info` with the
# character columns `id`, `parentId` and `name`.
#
# Outputs left in the workspace:
#   txt       - the filtered token table (columns `type` and `value`)
#   case_info - one row per dictionary entry

dict_url <- "http://wenshu.court.gov.cn/Assets/js/Lawyee.CPWSW.DictData.js"

# Download the raw script, pretty-print it, then split it into tokens.
txt <- getURL(dict_url, .encoding = "UTF-8")
txt <- uglify_reformat(txt, beautify = TRUE)
txt <- esprima_tokenize(txt, range = FALSE, loc = FALSE, comment = FALSE)

# Drop punctuation tokens, then the first three remaining tokens, which the
# original author treated as unused preamble. tail(, -3) is used instead of
# 4:nrow() so that a table with fewer than four rows yields an empty result
# rather than a descending index.
txt <- txt[!(txt$type %in% "Punctuator"), ]
txt <- tail(txt, -3)

# Drop the `key` labels and the literal values "1".."4" that accompany them.
# NOTE(review): this removes *every* token equal to "1".."4", so an entry whose
# id or parentId were one of those values would be lost too - confirm that the
# dictionary contains no such entry.
key_noise <- c("key", "\"1\"", "\"2\"", "\"3\"", "\"4\"")
txt <- txt[!(txt$value %in% key_noise), ]

# Renumber the rows 1..n after filtering.
rownames(txt) <- NULL

# Return the token that follows each occurrence of `field`, with the
# surrounding double quotes of the JavaScript string literal removed.
value_after <- function(tokens, field) {
  raw <- tokens$value[which(tokens$value == field) + 1L]
  gsub("\"", "", raw, fixed = TRUE)
}

entry_id <- value_after(txt, "id")
entry_parent <- value_after(txt, "parentId")
entry_name <- value_after(txt, "name")

# Every entry must contribute exactly one id, parentId and name; fail loudly
# instead of letting data.frame() recycle a shorter column.
stopifnot(
  length(entry_id) == length(entry_parent),
  length(entry_id) == length(entry_name)
)

case_info <- data.frame(
  id = entry_id,
  parentId = entry_parent,
  name = entry_name,
  stringsAsFactors = FALSE
)

爬取结果:

猜你喜欢

转载自blog.csdn.net/qq_38984677/article/details/81393212