接上上篇博文:当时用 jsoup 爬到的只是最初的纯静态页面(jsoup 不会执行页面中的 JavaScript),现改用 HtmlUnit 重新实现。
## 关键代码
关键代码如下:将 setJavaScriptEnabled 设置为 true,并等待后台 JavaScript 运行一段时间后,返回一个 HtmlPage 页面对象,接下来的工作便是对这个页面进行解析。
/**
 * Loads {@code url} with a JavaScript-enabled HtmlUnit client that emulates Chrome
 * and returns the resulting page.
 *
 * <p>CSS processing and native ActiveX are disabled, and neither script errors nor
 * failing HTTP status codes raise exceptions. The connection timeout is set to 3000 ms
 * and background JavaScript is given up to 1000 ms to run before the page is returned.
 *
 * <p>NOTE(review): the {@code WebClient} created here is never closed. The returned page
 * still belongs to that client, so closing it inside this method could affect the page;
 * consider letting the caller own the client's lifecycle - TODO confirm.
 *
 * @param url address of the page to load
 * @return the page object obtained from the client after the JavaScript wait
 * @throws IOException if loading the page fails with an I/O error
 * @throws MalformedURLException if {@code url} is not a valid URL
 */
private HtmlPage visitWebByUrl(String url) throws IOException, MalformedURLException {
    // Silence logging BEFORE the client is constructed: commons-logging only applies this
    // attribute to Log instances created afterwards, so configuring it after
    // "new WebClient(...)" would miss loggers created while HtmlUnit classes were loading.
    LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log",
            "org.apache.commons.logging.impl.NoOpLog");
    java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);

    // Build a client that emulates the Chrome browser.
    WebClient webClient = new WebClient(BrowserVersion.CHROME);

    // JavaScript must be on so that dynamically rendered content ends up in the DOM.
    webClient.getOptions().setJavaScriptEnabled(true);
    webClient.getOptions().setActiveXNative(false);
    webClient.getOptions().setCssEnabled(false);
    // Do not abort on script errors or non-2xx responses.
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    webClient.getOptions().setTimeout(3000);

    HtmlPage rootPage = webClient.getPage(url);
    // Give background JavaScript some time to run before handing the page back.
    webClient.waitForBackgroundJavaScript(1000);
    return rootPage;
}
## 解析工作
解析其实都是做前端的人都会的东西:用 CSS 选择器从 DOM 中取出元素,再读取它们的属性……
/**
 * Extracts one {@code Song} per {@code div.songlist__item} element of the given page.
 *
 * <p>For each row the song link supplies {@code mid} (its {@code href} with the song URL
 * prefix and {@code .html} suffix removed) and {@code name} (the part of its {@code title}
 * before the first space). The artist link supplies {@code singer} and {@code singermid};
 * the album link supplies {@code albumMid} and {@code album}. Each extracted value is also
 * passed to {@code println}.
 *
 * <p>Rows without a song link are skipped. When the artist or album link is missing, the
 * corresponding fields are left unset, so one incomplete row does not abort the whole
 * parse with a {@code NullPointerException}.
 *
 * @param rootPage page whose song-list rows are parsed
 * @return the songs found, in document order; empty when no rows match
 */
private List<Song> parseDomToSongList(HtmlPage rootPage) {
    List<Song> lstSong = new ArrayList<>();
    final DomNodeList<DomNode> divs = rootPage.querySelectorAll("div.songlist__item");
    for (DomNode div : divs) {
        HtmlElement nameLink = div.querySelector(".songlist__songname_txt a");
        if (nameLink == null) {
            // querySelector yields null when nothing matches; without the song link
            // there is neither an id nor a name, so the row is skipped.
            continue;
        }

        Song song = new Song();
        song.mid = nameLink.getAttribute("href")
                .replace("https://y.qq.com/n/yqq/song/", "")
                .replace(".html", "");
        println(song.mid);
        // Keep only the part of the title before the first space.
        song.name = nameLink.getAttribute("title").split(" ")[0];
        println(song.name);

        HtmlElement artistLink = div.querySelector(".songlist__artist a");
        if (artistLink != null) {
            song.singer = artistLink.getAttribute("title");
            println(song.singer);
            song.singermid = artistLink.getAttribute("data-singermid");
            println(song.singermid);
        }

        HtmlElement albumLink = div.querySelector(".songlist__album a");
        if (albumLink != null) {
            song.albumMid = albumLink.getAttribute("data-albummid");
            println(song.albumMid);
            song.album = albumLink.getAttribute("title");
            println(song.album);
        }

        lstSong.add(song);
    }
    return lstSong;
}
## 测试
做完后稍微测了一下,功能 OK。
/**
 * Visits the search-result page for a keyword and parses its song list.
 *
 * <p>A timestamp is printed before and after the run. A failed fetch now fails the test
 * instead of only printing a stack trace and passing.
 */
@Test // visit the keyword search-result page
public void test() {
    logOn = false; // presumably switches off the helper's own logging - TODO confirm
    String url = genQueryUrl("战 排骨教主");
    println(new Date().toString());
    try {
        HtmlPage rootPage = visitWebByUrl(url);
        parseDomToSongList(rootPage);
    } catch (FailingHttpStatusCodeException | IOException e) {
        // Surface the failure to the test runner and keep the original cause.
        throw new AssertionError("Failed to load search results from " + url, e);
    } finally {
        // The end timestamp is printed whether or not the fetch succeeded.
        println(new Date().toString());
    }
}
/**
 * Visits a single song page and prints the album cover URL and the lyrics.
 *
 * <p>The current time is printed before and after the run. A failed fetch or a missing
 * cover image now fails the test with a descriptive error instead of only printing a
 * stack trace or raising a bare {@code NullPointerException}.
 */
@Test // visit the song page; fetch the album cover and the lyrics
public void test2() {
    logOn = false; // presumably switches off the helper's own logging - TODO confirm
    printNowTime();
    String url = String.format("https://y.qq.com/n/yqq/song/%s.html", "000wDA7M23CRIf");
    println(new Date().toString());
    try {
        HtmlPage rootPage = visitWebByUrl(url);

        // Album cover image.
        HtmlElement ele = rootPage.querySelector("img.data__photo");
        if (ele == null) {
            // querySelector yields null when nothing matches.
            throw new AssertionError("No img.data__photo element found on " + url);
        }
        println(ele.getAttribute("src"));

        // Lyrics container, looked up by element id.
        ele = rootPage.getHtmlElementById("lrc_content");
        print(ele.asText());
    } catch (FailingHttpStatusCodeException | IOException e) {
        // Surface the failure to the test runner and keep the original cause.
        throw new AssertionError("Failed to load song page " + url, e);
    } finally {
        // The end time is printed whether or not the fetch succeeded.
        printNowTime();
    }
}