NiceLeeのBlog 用爱发电 bilibili~

Java htmlUnit动态爬取页面实例

2018-10-28
nIceLee

阅读:


接上上篇博文:当时使用 jsoup 爬取的页面只是最初的纯静态页面,现改用 HtmlUnit 重新加以实现。

关键代码

如下:将 setJavaScriptEnabled 设置成 true,并通过 waitForBackgroundJavaScript 让页面脚本运行一段时间,然后返回一个 HtmlPage 对象;接下来的工作便是对该页面进行解析。

/**
 * Loads {@code url} with an HtmlUnit client that emulates Chrome and has JavaScript
 * enabled, waits for background scripts, and returns the loaded page.
 *
 * <p>NOTE(review): the client created here is never closed inside this method; confirm
 * whether the caller is expected to release it through the returned page.
 *
 * @param url address of the page to load
 * @return the page the client produced for {@code url}
 * @throws IOException declared for failures while fetching the page
 * @throws MalformedURLException declared alongside {@code IOException}; presumably raised
 *         when {@code url} cannot be parsed -- confirm against the HtmlUnit version in use
 */
private HtmlPage visitWebByUrl(String url) throws IOException, MalformedURLException {
		// Client that presents itself as Chrome.
		final WebClient client = new WebClient(BrowserVersion.CHROME);

		// Route commons-logging to the no-op logger and switch the HtmlUnit JUL logger off.
		LogFactory.getFactory().setAttribute(
				"org.apache.commons.logging.Log",
				"org.apache.commons.logging.impl.NoOpLog");
		java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);

		// Script errors and failing HTTP status codes must not surface as exceptions.
		client.getOptions().setThrowExceptionOnScriptError(false);
		client.getOptions().setThrowExceptionOnFailingStatusCode(false);
		// CSS and native ActiveX are switched off; only JavaScript execution is wanted.
		client.getOptions().setCssEnabled(false);
		client.getOptions().setActiveXNative(false);
		client.getOptions().setJavaScriptEnabled(true);
		client.getOptions().setTimeout(3000);

		final HtmlPage page = client.getPage(url);
		// Give background JavaScript a bounded amount of time to run before handing the page back.
		client.waitForBackgroundJavaScript(1000);
		return page;
	}

解析工作

其实是单纯的前端都会的东西…

/**
 * Builds one {@link Song} for each {@code div.songlist__item} element found in the page.
 *
 * <p>For every item the song link supplies {@code mid} (the {@code href} with the song URL
 * prefix and {@code .html} suffix removed) and {@code name} (the part of the {@code title}
 * before the first space). The artist link supplies {@code singer} and {@code singermid},
 * and the album link supplies {@code albumMid} and {@code album}. Each extracted value is
 * also passed to {@code println}.
 *
 * <p>An item without a song link is skipped. An item without an artist or album link is
 * still returned, with the corresponding fields left at their defaults.
 *
 * @param rootPage page to scan for song list items
 * @return the parsed songs in document order; empty when the page has no usable items
 */
private List<Song> parseDomToSongList(HtmlPage rootPage) {
		List<Song> lstSong = new ArrayList<>();
		final DomNodeList<DomNode> divs = rootPage.querySelectorAll("div.songlist__item");
		for (DomNode div : divs) {
			HtmlElement nameLink = div.querySelector(".songlist__songname_txt a");
			if (nameLink == null) {
				// querySelector returns null when nothing matches. Without the song link
				// there is neither an id nor a name, so the item is not usable.
				continue;
			}

			Song song = new Song();
			song.mid = nameLink.getAttribute("href")
					.replace("https://y.qq.com/n/yqq/song/", "")
					.replace(".html", "");
			println(song.mid);
			// NOTE(review): only the text before the first space is kept; confirm this is
			// intended for song names that themselves contain spaces.
			song.name = nameLink.getAttribute("title").split(" ")[0];
			println(song.name);

			HtmlElement artistLink = div.querySelector(".songlist__artist a");
			if (artistLink != null) {
				song.singer = artistLink.getAttribute("title");
				println(song.singer);
				song.singermid = artistLink.getAttribute("data-singermid");
				println(song.singermid);
			}

			HtmlElement albumLink = div.querySelector(".songlist__album a");
			if (albumLink != null) {
				song.albumMid = albumLink.getAttribute("data-albummid");
				println(song.albumMid);
				song.album = albumLink.getAttribute("title");
				println(song.album);
			}

			lstSong.add(song);
		}
		return lstSong;
	}

## 测试

做完后稍微测了一下,功能 OK。

	/**
	 * Visits the search results page for a keyword and parses it into songs.
	 * A timestamp is printed before and after the run.
	 */
	@Test
	public void test() {
		logOn = false;
		String url = genQueryUrl("战 排骨教主");
		println(new Date().toString());
		try {
			HtmlPage rootPage = visitWebByUrl(url);
			parseDomToSongList(rootPage);
		} catch (FailingHttpStatusCodeException | IOException e) {
			// Printing the stack trace alone would let the test pass after a failed load;
			// rethrow with the cause so the failure is reported.
			throw new AssertionError("failed to load or parse " + url, e);
		} finally {
			// The closing timestamp is printed whether or not the run succeeded.
			println(new Date().toString());
		}
	}
	
	/**
	 * Visits a single song page and prints the album cover URL and the lyrics.
	 * The current time is printed before and after the run.
	 */
	@Test
	public void test2() {
		logOn = false;
		printNowTime();
		String url = String.format("https://y.qq.com/n/yqq/song/%s.html", "000wDA7M23CRIf");
		println(new Date().toString());
		try {
			HtmlPage rootPage = visitWebByUrl(url);

			// Album cover image, then the lyrics container.
			HtmlElement ele = rootPage.querySelector("img.data__photo");
			if (ele == null) {
				// querySelector returns null when nothing matches; fail with a clear
				// message instead of a bare NullPointerException.
				throw new AssertionError("no img.data__photo element found on " + url);
			}
			println(ele.getAttribute("src"));
			ele = rootPage.getHtmlElementById("lrc_content");
			print(ele.asText());

		} catch (FailingHttpStatusCodeException | IOException e) {
			// Printing the stack trace alone would let the test pass after a failed load;
			// rethrow with the cause so the failure is reported.
			throw new AssertionError("failed to load " + url, e);
		} finally {
			// The closing time is printed whether or not the run succeeded.
			printNowTime();
		}
	}

内容
隐藏