获取招聘网站下的HR-Email信息

获取招聘网站下的HR-Email信息

前段时间,按照上面的要求,需要做一个职场黑名单的项目,负责的部分是数据采集,也就是通过对各大招聘网站,按照地区或者其它划分,采集HR的邮箱信息入库,由于采集的网站较多,所以把部分公用的方法放在一个类中,方便调用,下面是对51job的采集,代码如下:

package org.hr.integrity.crawl;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import java.util.HashSet;

import java.util.LinkedList;

import java.util.List;

import java.util.ArrayList;

import java.util.Iterator;

import java.util.Set;

import org.apache.commons.httpclient.NameValuePair;

import org.hr.util.ConnectionUtil;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

/**
 * Crawls 51job search-result pages and collects the e-mail addresses found on
 * the job-detail pages they link to.
 *
 * <p>Collected state is kept in static fields, so all instances share the same
 * visited-link list and e-mail set. The class is not written for concurrent use.
 *
 * @author 72414
 */
public class JobsHref {

    /** Unused by this class; kept because it is visible to other classes in the package. */
    NameValuePair[] data = null;

    /** Job-detail links that have already been fetched (company pages). */
    static List<String> col = new ArrayList<String>();

    static Example ex = new Example();

    /** Distinct e-mail addresses collected so far. */
    static Set<String> list = new HashSet<String>();

    /** E-mail matcher; compiled once instead of on every call. */
    private static final Pattern EMAIL_PATTERN =
            Pattern.compile("[a-zA-Z0-9\\.\\-\\_]+?@[a-zA-Z0-9\\.\\-\\_]+\\.[a-zA-Z]{2,3}");

    /** Address that is never recorded as a result. */
    private static final String EXCLUDED_EMAIL = "club@51job.com";

    /** Prefix a pagination link must start with to be followed. */
    private static final String LIST_URL_PREFIX = "https://search.51job.com/list/";

    /** Encoding passed to the HTTP helper when decoding pages. */
    private static final String PAGE_ENCODING = "GBK";

    /** Entry page of the crawl: the search result list. */
    private static final String SEARCH_URL =
            "http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=010000%2C00&district=000000&funtype=0000&industrytype=00&issuedate=3&providesalary=99&keywordtype=2&curr_page=1&lang=c&stype=2&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=01&lonlat=0%2C0&radius=-1&ord_field=0&list_type=0&fromType=14";

    /**
     * Looks for an e-mail address in {@code body} and records it in {@link #list}.
     *
     * <p>At most one address is recorded per call: the first match that is not
     * {@code club@51job.com}. Earlier matches equal to that address are skipped
     * rather than ending the search, so an address that follows it is still found.
     *
     * @param body page text to scan; {@code null} is treated as "no address"
     * @return {@code true} if an address was recorded, {@code false} otherwise
     */
    public boolean getEmail(String body) {
        if (body == null) {
            return false;
        }
        Matcher m = EMAIL_PATTERN.matcher(body);
        while (m.find()) {
            String email = m.group();
            if (!EXCLUDED_EMAIL.equals(email)) {
                System.out.println("email:" + email);
                list.add(email);
                return true;
            }
        }
        return false;
    }

    /**
     * Extracts the job links from one search-result page, fetches every link not
     * seen before, and scans each fetched page for an e-mail address.
     *
     * @param body  HTML of a search-result page; {@code null} or empty is a no-op
     * @param data1 not used; retained so existing callers keep compiling
     * @return the shared list of all job links fetched so far ({@link #col})
     * @throws Exception declared for backward compatibility with existing callers
     */
    public List<String> getHref(String body, NameValuePair[] data1) throws Exception {
        if (body == null || "".equals(body)) {
            return col;
        }
        Document doc = Jsoup.parse(body);
        for (Element row : doc.select("[class=el]")) {
            // Elements with class "t1" hold the job title and its link.
            for (Element titleCell : row.getElementsByClass("t1")) {
                Element link = titleCell.getElementsByTag("a").first();
                if (link == null) {
                    // A title cell without an anchor has nothing to follow.
                    continue;
                }
                String href = link.attr("href");
                if (!href.contains("https://") || col.contains(href)) {
                    continue;
                }
                col.add(href);
                // Second level of the breadth-first crawl: the job-detail page.
                String context = Example.getPostResponseWithHttpClient(href, PAGE_ENCODING);
                getEmail(context);
            }
        }
        return col;
    }

    /**
     * Accepts a URL only if it points into the 51job job-list section.
     *
     * @param url candidate link; may be {@code null}
     * @return the trimmed URL when it starts with
     *         {@code https://search.51job.com/list/}, otherwise an empty string
     */
    public static String getURLValidate2(String url) {
        if (url == null) {
            return "";
        }
        // Trim before testing so surrounding whitespace cannot defeat the prefix check.
        String trimmed = url.trim();
        return trimmed.startsWith(LIST_URL_PREFIX) ? trimmed : "";
    }

    /**
     * Collects the distinct pagination links of a search-result page.
     *
     * @param body  HTML of a search-result page; {@code null} or empty yields an empty list
     * @param data1 not used; retained so existing callers keep compiling
     * @return pagination URLs in document order, without duplicates
     * @throws Exception declared for backward compatibility with existing callers
     */
    public List<String> getHref1(String body, NameValuePair[] data1) throws Exception {
        LinkedList<String> pageLinks = new LinkedList<String>();
        if (body == null || "".equals(body)) {
            return pageLinks;
        }
        Document doc = Jsoup.parse(body);
        for (Element anchor : doc.select("div.p_in>ul>li>a")) {
            String href = getURLValidate2(anchor.attr("href"));
            if (!"".equals(href) && !pageLinks.contains(href)) {
                pageLinks.add(href);
            }
        }
        return pageLinks;
    }

    /**
     * Runs the crawl: first result page, then every pagination page, then
     * stores the collected addresses through {@code ConnectionUtil}.
     *
     * @param args ignored
     * @throws Exception if a crawl step or the output file write fails
     */
    public static void main(String[] args) throws Exception {
        JobsHref js = new JobsHref();
        // The crawl methods ignore their form-data argument, so no credentials are needed.
        NameValuePair[] noFormData = new NameValuePair[0];

        String body = Example.getGetResponseWithHttpClient(SEARCH_URL, PAGE_ENCODING);
        js.getHref(body, noFormData);

        for (String pageUrl : js.getHref1(body, noFormData)) {
            String result = Example.getGetResponseWithHttpClient(pageUrl, PAGE_ENCODING);
            js.getHref(result, noFormData);
        }

        // NOTE(review): this writes Example's own list, not JobsHref.list - confirm that is intended.
        ex.printEmialList();

        ConnectionUtil cu = new ConnectionUtil();
        for (String str : list) {
            cu.addEmail(str.trim());
        }
        System.out.println("运行完成!");
    }
}

下面是公用的代码部分:

package org.hr.integrity.crawl;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStreamReader;

import java.io.UnsupportedEncodingException;

import java.util.Iterator;

import java.util.LinkedList;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;

import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;

import org.apache.commons.httpclient.NameValuePair;

import org.apache.commons.httpclient.cookie.CookiePolicy;

import org.apache.commons.httpclient.methods.GetMethod;

import org.apache.commons.httpclient.methods.PostMethod;

public class Example {

// 获得ConnectionManager,设置相关参数

private static MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();

private static int connectionTimeOut = 20000;

private static int socketTimeOut = 10000;

private static int maxConnectionPerHost = 5;

private static int maxTotalConnections = 40;

// 标志初始化是否完成的flag

private static boolean initialed = true;//设置值为true,2018年6月7日 10:28:09

static List list=new LinkedList();//总邮箱list

// 初始化ConnectionManger的方法

public static void SetPara() {

manager.getParams().setConnectionTimeout(connectionTimeOut);

manager.getParams().setSoTimeout(socketTimeOut);

manager.getParams().setDefaultMaxConnectionsPerHost(

maxConnectionPerHost);

manager.getParams().setMaxTotalConnections(maxTotalConnections);

initialed = true;

}

// 通过get方法获取网页内容

public static String getGetResponseWithHttpClient(String url, String encode) {

HttpClient client = new HttpClient(manager);

if (initialed) {

Example.SetPara();

}

GetMethod get = new GetMethod(url);

get.getParams().setParameter("http.protocol.cookie-policy",CookiePolicy.BROWSER_COMPATIBILITY);//去除警告

get.setFollowRedirects(true);

String result = null;

StringBuffer resultBuffer = new StringBuffer();

try {

client.executeMethod(get);

// 在目标页面情况未知的条件下,不推荐使用getResponseBodyAsString()方法

//String strGetResponseBody = post.getResponseBodyAsString();

BufferedReader in = new BufferedReader(new InputStreamReader(get

.getResponseBodyAsStream(), get.getResponseCharSet()));

String inputLine = null;

while ((inputLine = in.readLine()) != null) {

resultBuffer.append(inputLine);

resultBuffer.append("\n");

}

in.close();

result = resultBuffer.toString();

// iso-8859-1 is the default reading encode

result = Example.ConverterStringCode(resultBuffer

.toString(), get.getResponseCharSet(), encode);

} catch (Exception e) {

e.printStackTrace();

result = "";

} finally {

get.releaseConnection();

}

return result;

}

@SuppressWarnings("resource")

public static void addEmail(String email) throws Exception{

FileOutputStream fos = new FileOutputStream(new File("1_1email.txt"),true);

fos.write(email.getBytes());

}

void printEmialList()throws IOException{

FileOutputStream fos = new FileOutputStream(new File("email.txt"),true);

Iterator it=list.iterator();

System.out.println("生成email");

while(it.hasNext()){

String ema=it.next()+",";

fos.write(ema.getBytes());

}

fos.close();

}

public static String getPostResponseWithHttpClient(String url, String encode) {

HttpClient client = new HttpClient(manager);

if (initialed) {

HttpClientExample.SetPara();

}

PostMethod post = new PostMethod(url);

post.getParams().setParameter("http.protocol.cookie-policy",CookiePolicy.BROWSER_COMPATIBILITY);//去除警告

post.setFollowRedirects(false);

StringBuffer resultBuffer = new StringBuffer();

String result = null;

try {

client.executeMethod(post);

BufferedReader in = new BufferedReader(new InputStreamReader(post

.getResponseBodyAsStream(), post.getResponseCharSet()));

String inputLine = null;

while ((inputLine = in.readLine()) != null) {

resultBuffer.append(inputLine);

resultBuffer.append("\n");

}

in.close();

// iso-8859-1 is the default reading encode

result = Example.ConverterStringCode(resultBuffer

.toString(), post.getResponseCharSet(), encode);

} catch (Exception e) {

e.printStackTrace();

result = "";

} finally {

post.releaseConnection();

}

return result;

}

public static boolean getEmail(String body){

boolean flag=false;

try{

Pattern p = Pattern.compile("[a-zA-Z0-9\\.\\-\\_]+?@[a-zA-Z0-9\\.\\-\\_]+\\.[a-zA-Z]{2,3}");

Matcher m =p.matcher(body);

if(m.find()){

flag=true;

String email=m.group();

//System.out.println("SSSS:"+email);

if(!list.contains(email)){

list.add(email);

addEmail(email);//将得到的Email加入数据库,这里先加入文本里面

}

}

}

catch(Exception e){

e.printStackTrace();

}

return flag;

}

public static String getPostResponseWithHttpClient (String url,

String encode, NameValuePair[] nameValuePair) throws Exception {

HttpClient client = new HttpClient(manager);

if (initialed) {//

HttpClientExample.SetPara();//初始化ConnectionManger的方法

}

PostMethod post = new PostMethod(url);

post.setRequestBody(nameValuePair);//将表单所有的值设置到PostMethod中

post.getParams().setParameter(//去除警告

"http.protocol.cookie-policy",CookiePolicy.BROWSER_COMPATIBILITY);

post.setFollowRedirects(false);//设置此类是否应该自动执行http重定向

String result = null;

StringBuffer resultBuffer = new StringBuffer();

try {

client.executeMethod(post);

BufferedReader in = new BufferedReader(new InputStreamReader(post

.getResponseBodyAsStream(), post.getResponseCharSet()));

String inputLine = null;

while ((inputLine = in.readLine()) != null) {

resultBuffer.append(inputLine);

resultBuffer.append("\n");

}

in.close();

// iso-8859-1 is the default reading encode

result = Example.ConverterStringCode(resultBuffer.toString(), post.getResponseCharSet(), encode);

//System.out.println("result:"+result.length());

if(getEmail(result)){//验证网址

System.out.println("hasemailurl:"+url);

}

} catch (Exception e) {

e.printStackTrace();

result = "";

} finally {

post.releaseConnection();

}

return result;

}

private static String ConverterStringCode(String source, String srcEncode,

String destEncode) {

if (source != null) {

try {

return new String(source.getBytes(srcEncode), destEncode);

} catch (UnsupportedEncodingException e) {

// TODO Auto-generated catch block

e.printStackTrace();

return "";

}

} else {

return "";

}

}

}

上面的代码是先爬取能获取到的页面,爬到的邮箱先放入一个list里面,爬完之后再放入到数据库中,下面是ConnectionUtil.java中插入到数据库的片段代码:

/**
 * Inserts one e-mail address into the {@code hrintegrity.email} table.
 *
 * <p>A connection is opened from {@code connStr} for this call and closed
 * again before returning. Failures are printed and reported as {@code false}
 * rather than thrown.
 *
 * @param em the e-mail address to store
 * @return {@code true} if exactly one row was inserted, {@code false} otherwise
 * @author yuyu
 */
public boolean addEmail(String em){
    boolean result = false;
    PreparedStatement stmts = null;
    try {
        conn = DriverManager.getConnection(connStr);
        String sqlInset = "insert into hrintegrity.email(email) values(?)";
        stmts = conn.prepareStatement(sqlInset);
        stmts.setString(1, em);
        // TODO: check whether the e-mail already exists in the table before inserting.
        // executeUpdate returns the number of affected rows.
        result = stmts.executeUpdate() == 1;
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close the statement first, then the connection; either may be null
        // if the failure happened before it was created.
        if (stmts != null) {
            try {
                stmts.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (conn != null) {
            try {
                conn.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    return result;
}

上面就是一个获取51job的邮箱的完整代码,除了51job外,其它招聘网站的获取方式大同小异,如智联,不同点就是在Example.java中调用的方法不同,而且在采集数据的时候select的标签不一样,需要自己一个一个去尝试。

有问题可以在留言中一起交流。

🎈 相关推荐

有哪些常见的线下营销渠道?
beat365英国在线体育

有哪些常见的线下营销渠道?

📅 07-11 👀 6395
广西兵为何叫“狼兵”?死守桂林14天,2万草鞋兵迎战15万精锐
lol世界各大服务器都叫什么
365体育亚洲官方登录

lol世界各大服务器都叫什么

📅 07-03 👀 1355