【数盟致力于成为最卓越的数据科学社区,聚焦于大数据、分析挖掘、数据可视化领域,业务范围:线下活动、在线课程、猎头服务、项目对接】
作者:wuyidexinsheng
问:能利用爬虫技术做到哪些很酷很有趣很有用的事情?
准备学习python爬虫。各位大神都会用爬虫做哪些有趣的事情?今天突然想玩玩爬虫,就提了这个问题。跟着YouTube上的一个tutor写了个简单的程序,爬了一点豆瓣的数据。主要用到requests和bs4(BeautifulSoup)模块。虽然简陋,毕竟是人生中的第一只爬虫啊……以示纪念,代码写在博客里了:我的第一只爬虫:爬取豆瓣读书。








<span style="font-size:10px;"><?xml version="1.0"?>
<configuration>
<appSettings>
<!--数据有两种存储方式,一种存储于本地程序目录下的Ids.txt,但那只存了问题ID,完整的数据存于oracle数据库中-->
<add key="connStr" value="Data Source=(DESCRIPTION=(ADDRESS_LIST=(ADDRESS=(PROTOCOL=TCP)(HOST=127.0.0.1)(PORT=1521)))(CONNECT_DATA=(SERVER=DEDICATED)(SERVICE_NAME=DZZH)));User Id=xxxxx;Password=xxxx"/>
<!-- 设置监测循环时间:秒 -->
<add key="Interval" value="600"/>
<!--设置自动发送信息机器人邮箱-->
<add key="smtpAddress" value="smtp.163.com"/>
<!--用户名-->
<add key="sendEmailFrom" value="xxxxxxxx@163.com"/>
<!--密码-->
<add key="sendEmailFromPwd" value="xxxxxxxxx"/>
<!--接收邮箱地址-->
<add key="strMailAddressTo" value="xxxxxxxx@163.com,xxxxxxxx@qq.com"/>
<!--邮件名称抬头-->
<add key="EmailName" value="zhApp-家里电脑"/>
<!--END-->
<!--监测地址-->
<add key="WatchingURL" value="http://www.zhihu.com/people/wu-xin-sheng-7"/>
</appSettings>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.0"/>
</startup>
</configuration></span>
<span class="name">伍新生</span>,<span class="bio" title="五颜六色的情感,我毕生的追求!">五颜六色的情感,我毕生的追求!</span>
</div>
</div>
<div class="body clearfix">
<div class="zm-profile-header-avatar-container self">
<img alt="伍新生"
src="http://pic4.zhimg.com/94cc60166_l.jpg"
class="zm-profile-header-img zg-avatar-big zm-avatar-editor-preview"/>
<span class="zm-entry-head-avatar-edit-button">修改头像</span>
/// <summary>
/// Extracts every question link found in the HTML of the watched page.
/// </summary>
/// <param name="source">Raw HTML of the watched page. Null or empty yields an empty list.</param>
/// <returns>One <see cref="Question"/> per "question_link" anchor, in the order they appear in <paramref name="source"/>.</returns>
private List<Question> GetQuestions(string source)
{
    const string startStr = "<a class=\"question_link\"";
    List<Question> questions = new List<Question>();
    if (string.IsNullOrEmpty(source))
    {
        return questions;
    }

    // Iterate instead of recursing once per anchor, so a page with many links
    // cannot grow the call stack. Markup is matched ordinally because it is not
    // linguistic text.
    int index = source.IndexOf(startStr, StringComparison.Ordinal);
    while (index != -1)
    {
        Question q = new Question();
        // NOTE(review): StringPlus.SubString is assumed to return the text between the
        // first occurrence of the start marker and the following end marker - confirm.
        string content = Cinser.Common.StringPlus.SubString(source, startStr, "</a>");
        q.Id = Cinser.Common.StringPlus.SubString(content, "question/", "\"");
        // The title is everything after the first '>' of the extracted anchor text.
        q.Title = content.Substring(content.IndexOf(">", StringComparison.Ordinal) + 1);
        q.Time = DateTime.Now;
        questions.Add(q);

        // Drop everything up to and including the marker just handled, then look again.
        source = source.Substring(index + startStr.Length);
        index = source.IndexOf(startStr, StringComparison.Ordinal);
    }
    return questions;
}
/// <summary>
/// Hidden main form of the watcher: it registers the program for auto-start,
/// polls the configured page on a timer, and mails any questions not seen before.
/// </summary>
public partial class Form1 : Form
{
    // TCP port handed to the SMTP helper (the original literal was 0x19).
    private const int SmtpPort = 25;

    DataProvider dal;
    string watchingURL = string.Empty;
    int LoopCount = 0;

    /// <summary>
    /// Builds the form, hides it, reads the configuration, registers auto-start
    /// and performs the first monitoring pass.
    /// </summary>
    public Form1()
    {
        InitializeComponent();
        dal = new DataProvider();
        dal.AddLog("程序启动");

        // Show the form minimized and hide it straight away, then strip the window
        // chrome and the taskbar entry.
        base.WindowState = FormWindowState.Minimized;
        base.Show();
        base.Hide();
        base.WindowState = FormWindowState.Normal;
        base.ShowInTaskbar = false;
        base.TopMost = false;
        base.MaximizeBox = false;
        base.MinimizeBox = false;
        base.ControlBox = false;

        // Polling period comes from the "Interval" setting, expressed in seconds.
        timer1.Interval = ReadIntervalMilliseconds();
        watchingURL = Cinser.Common.ConfigurationHelper.GetAppSettingsValue("WatchingURL");
        RunWhenStart(true, "zhApp.exe", "\"" + Application.StartupPath + "\\zhApp.exe\" AutoRun");
        RunSafely();
        dal.AddLog("程序初始化成功");
        LoopCount += 1;
    }

    /// <summary>
    /// Adds or removes this program's entry under the current user's
    /// "Run" registry key.
    /// </summary>
    /// <param name="Started">true to write the entry, false to remove it.</param>
    /// <param name="name">Registry value name.</param>
    /// <param name="path">Command line stored as the value data.</param>
    /// <remarks>
    /// Registry exceptions (for example missing permissions) propagate to the caller
    /// with their original stack trace. Removing an entry that does not exist is not an error.
    /// </remarks>
    public void RunWhenStart(bool Started, string name, string path)
    {
        // Dispose the opened subkey; the original only closed Registry.CurrentUser
        // and left the "Run" subkey handle open.
        using (RegistryKey runKey = Registry.CurrentUser.CreateSubKey(@"SOFTWARE\Microsoft\Windows\CurrentVersion\Run"))
        {
            if (Started)
            {
                runKey.SetValue(name, path);
            }
            else
            {
                runKey.DeleteValue(name, false);
            }
        }
    }

    /// <summary>
    /// Reads the "Interval" application setting (seconds) and converts it to milliseconds.
    /// </summary>
    private int ReadIntervalMilliseconds()
    {
        int interval = int.Parse(Cinser.Common.ConfigurationHelper.GetAppSettingsValue("Interval"));
        return interval * 1000;
    }

    /// <summary>
    /// Runs one monitoring pass and logs any failure instead of letting it escape,
    /// so a single failed pass does not stop the monitor.
    /// </summary>
    private void RunSafely()
    {
        try
        {
            Run();
        }
        catch (Exception ex)
        {
            dal.AddLog("监测流程出错:" + ex.Message);
        }
    }

    /// <summary>
    /// One monitoring pass: download the watched page, keep only questions whose
    /// ID has not been recorded yet, mail them and persist the enlarged ID list.
    /// </summary>
    private void Run()
    {
        List<Question> questions = GetQuestions(Cinser.Common.HttpHelper.GetString(watchingURL));
        string ids = dal.GetExistIdsStr() ?? string.Empty;

        // Compare whole IDs. A substring search on the comma-separated string would
        // treat "123" as already known whenever "1234" is present.
        HashSet<string> knownIds = new HashSet<string>(StringComparer.Ordinal);
        foreach (string part in ids.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries))
        {
            knownIds.Add(part.Trim());
        }

        List<Question> newQuestions = new List<Question>();
        foreach (Question question in questions)
        {
            // Entries without an ID are skipped, and HashSet.Add returns false for an ID
            // that is already recorded or was already seen earlier on this page.
            if (string.IsNullOrEmpty(question.Id) || !knownIds.Add(question.Id))
            {
                continue;
            }

            newQuestions.Add(question);
            ids = ids.Length == 0 ? question.Id : ids + "," + question.Id;
        }

        if (newQuestions.Count > 0)
        {
            SendQuestions(newQuestions);
            dal.WriteIdStrToTxt(ids);
            dal.Add(newQuestions);
            dal.AddLog(string.Format("获取了{0}条新数据。", newQuestions.Count));
        }
    }

    /// <summary>
    /// Extracts every question link found in the HTML of the watched page.
    /// </summary>
    /// <param name="source">Raw HTML of the watched page. Null or empty yields an empty list.</param>
    /// <returns>One <see cref="Question"/> per "question_link" anchor, in the order they appear in <paramref name="source"/>.</returns>
    private List<Question> GetQuestions(string source)
    {
        const string startStr = "<a class=\"question_link\"";
        List<Question> questions = new List<Question>();
        if (string.IsNullOrEmpty(source))
        {
            return questions;
        }

        // Iterate instead of recursing once per anchor so the call stack stays flat.
        int index = source.IndexOf(startStr, StringComparison.Ordinal);
        while (index != -1)
        {
            Question q = new Question();
            // NOTE(review): StringPlus.SubString is assumed to return the text between the
            // first occurrence of the start marker and the following end marker - confirm.
            string content = Cinser.Common.StringPlus.SubString(source, startStr, "</a>");
            q.Id = Cinser.Common.StringPlus.SubString(content, "question/", "\"");
            // The title is everything after the first '>' of the extracted anchor text.
            q.Title = content.Substring(content.IndexOf(">", StringComparison.Ordinal) + 1);
            q.Time = DateTime.Now;
            questions.Add(q);

            // Drop everything up to and including the marker just handled, then look again.
            source = source.Substring(index + startStr.Length);
            index = source.IndexOf(startStr, StringComparison.Ordinal);
        }
        return questions;
    }

    /// <summary>
    /// Mails the given questions to every configured recipient.
    /// </summary>
    /// <param name="questions">Questions to list in the mail body.</param>
    /// <returns>Always true; the outcome reported by the mail helper is not inspected.</returns>
    private bool SendQuestions(List<Question> questions)
    {
        bool bSuccess = true;
        List<string> strMailAddressTo = Cinser.Common.ConfigurationHelper.GetAppSettingsValue("strMailAddressTo").Split(',').ToList();
        string smtpAddress = Cinser.Common.ConfigurationHelper.GetAppSettingsValue("smtpAddress");
        string sendEmailFrom = Cinser.Common.ConfigurationHelper.GetAppSettingsValue("sendEmailFrom");
        string sendEmailFromPwd = Cinser.Common.ConfigurationHelper.GetAppSettingsValue("sendEmailFromPwd");
        string emailName = Cinser.Common.ConfigurationHelper.GetAppSettingsValue("EmailName");
        // Password decryption step. It is disabled so the password can be configured
        // in plain text; re-enable it if the setting holds an encrypted value.
        //sendEmailFromPwd = Cinser.Common.Security.DecryptDES(sendEmailFromPwd, "yuiophgf");
        string msg = string.Empty;
        SendCompletedEventHandler s = new SendCompletedEventHandler(SendCompleted);
        string content = GetQustionsListStr(questions);
        content += "\n\n信息来源于:" + watchingURL;
        // NOTE(review): msg is filled by the helper but never examined - confirm whether
        // it carries an error description that should be logged.
        Cinser.Common.SmtpEmailSend.SendEmail(strMailAddressTo, emailName + DateTime.Now.ToString(), content, smtpAddress, SmtpPort, sendEmailFrom, sendEmailFromPwd, "163测试邮箱", null, out msg, s);
        return bSuccess;
    }

    /// <summary>
    /// Completion callback handed to the mail helper; intentionally does nothing.
    /// </summary>
    private void SendCompleted(object sender, AsyncCompletedEventArgs e)
    {
    }

    /// <summary>
    /// Formats the questions as one "title, url" entry per line for the mail body.
    /// </summary>
    /// <param name="questions">Questions to format; null or empty yields an empty string.</param>
    /// <returns>The entries joined by a line feed followed by a space.</returns>
    private string GetQustionsListStr(List<Question> questions)
    {
        if (questions == null || questions.Count == 0)
        {
            return string.Empty;
        }

        string[] entries = new string[questions.Count];
        for (int i = 0; i < questions.Count; i++)
        {
            entries[i] = string.Format("名称:{0},url:{1}", questions[i].Title, questions[i].Url);
        }
        return string.Join("\n ", entries);
    }

    /// <summary>
    /// Timer callback: re-reads the polling period, runs one monitoring pass
    /// and logs the loop counter.
    /// </summary>
    private void timer1_Tick(object sender, EventArgs e)
    {
        timer1.Interval = ReadIntervalMilliseconds();
        RunSafely();
        dal.AddLog(string.Format("程序循环次数:{0}", LoopCount++));
    }
}
/// <summary>
/// A single question captured from the watched page.
/// </summary>
public class Question
{
    // Explicit backing fields are kept (rather than auto-properties) in case the
    // data-access helper that persists this type inspects them - TODO confirm.
    string id, title, url, type, remark;
    DateTime time;

    /// <summary>Free-form remark.</summary>
    public string Remark
    {
        get { return remark; }
        set { remark = value; }
    }

    /// <summary>Time associated with the question.</summary>
    public DateTime Time
    {
        get { return time; }
        set { time = value; }
    }

    /// <summary>Question type.</summary>
    public string Type
    {
        get { return type; }
        set { type = value; }
    }

    /// <summary>
    /// Address of the question. When no URL has been assigned, it is built from
    /// <see cref="Id"/> on first read and remembered from then on.
    /// </summary>
    public string Url
    {
        get
        {
            if (string.IsNullOrEmpty(url))
            {
                url = string.Format("http://www.zhihu.com/question/{0}", Id);
            }
            return url;
        }
        set { url = value; }
    }

    /// <summary>Question title.</summary>
    public string Title
    {
        get { return title; }
        set { title = value; }
    }

    /// <summary>Question identifier, as used in the question URL.</summary>
    public string Id
    {
        get { return id; }
        set { id = value; }
    }
}
/// <summary>
/// Data-access provider for the question table. It also maintains the local
/// Ids.txt fallback list and the program log file.
/// </summary>
public class DataProvider
{
    // Once the log file holds this many lines it is restarted from scratch.
    private const int MaxLogLines = 520;

    private string connStr = string.Empty;
    Cinser.DBUtility.DAL.OracleDALCommon dal;
    string txtPath = "";
    string logPath = "";
    string debugPath = string.Empty;

    /// <summary>
    /// Directory holding Ids.txt and Log.txt, always ending with a backslash.
    /// Defaults to the application base directory.
    /// </summary>
    public string DebugPath
    {
        get
        {
            if (debugPath == string.Empty)
            {
                debugPath = System.AppDomain.CurrentDomain.BaseDirectory;
            }
            if (debugPath.EndsWith("\\") == false)
            {
                debugPath += "\\";
            }
            return debugPath;
        }
        set { debugPath = value; }
    }

    /// <summary>Full path of the log file (Log.txt inside <see cref="DebugPath"/>).</summary>
    public string LogPath
    {
        get
        {
            if (logPath == string.Empty)
            {
                logPath = DebugPath + "Log.txt";
            }
            return logPath;
        }
    }

    /// <summary>
    /// Creates the provider and its database helper from <see cref="ConnStr"/>.
    /// </summary>
    public DataProvider()
    {
        dal = new Cinser.DBUtility.DAL.OracleDALCommon(this.ConnStr);
    }

    /// <summary>Full path of the local ID list (Ids.txt inside <see cref="DebugPath"/>).</summary>
    public string TxtPath
    {
        get
        {
            if (txtPath == string.Empty)
            {
                txtPath = DebugPath + "Ids.txt";
            }
            return txtPath;
        }
    }

    /// <summary>
    /// Database connection string; read from the "connStr" application setting
    /// on first use unless one was assigned explicitly.
    /// </summary>
    public string ConnStr
    {
        get
        {
            if (connStr == string.Empty)
            {
                connStr = Cinser.Common.ConfigurationHelper.GetAppSettingsValue("connStr");
            }
            return connStr;
        }
        set { connStr = value; }
    }

    /// <summary>
    /// Result of asking the database helper to open a connection.
    /// </summary>
    public bool CanConnectOracleServer
    {
        get
        {
            return dal.Open();
        }
    }

    /// <summary>
    /// Writes the captured questions to the database.
    /// </summary>
    /// <param name="questions">Questions to insert.</param>
    /// <returns>true when every insert succeeded; false (with the error logged) otherwise.</returns>
    public bool Add(List<Question> questions)
    {
        try
        {
            for (int i = 0; i < questions.Count; i++)
            {
                dal.Add("qustions", questions[i]);
            }
            return true;
        }
        catch (Exception ex)
        {
            // Still report failure through the return value, but leave a trace in the
            // log instead of discarding the error silently.
            LogFailure("写入数据库失败", ex);
            return false;
        }
    }

    /// <summary>
    /// Reads rows from the question table.
    /// </summary>
    /// <param name="sqlWhere">SQL condition applied to the query; the default matches every row.</param>
    /// <returns>The matching rows, or null when the query failed (the error is logged).</returns>
    public DataTable GetQustions(string sqlWhere = "1=1")
    {
        try
        {
            return dal.GetDataList("qustions", sqlWhere);
        }
        catch (Exception ex)
        {
            LogFailure("查询数据库失败", ex);
            return null;
        }
    }

    /// <summary>
    /// Tells whether a question with the given ID is already stored.
    /// </summary>
    /// <param name="id">Question ID to look up.</param>
    /// <returns>true when at least one row matches; false when none does or the query failed.</returns>
    public bool IsExist(string id)
    {
        try
        {
            // IDs are scraped from a web page and the helper only accepts a textual
            // condition, so single quotes are doubled to keep the value inside the literal.
            string sqlWhere = "id='" + (id ?? string.Empty).Replace("'", "''") + "'";
            DataTable dt = dal.GetDataList("qustions", sqlWhere);
            return dt.Rows.Count > 0;
        }
        catch (Exception ex)
        {
            LogFailure("查询数据库失败", ex);
            return false;
        }
    }

    /// <summary>
    /// Returns the IDs of the questions captured so far as a comma-separated string.
    /// </summary>
    /// <returns>
    /// The IDs read from the database when it can be reached and queried; otherwise the
    /// content of Ids.txt, or an empty string when that file does not exist.
    /// </returns>
    public string GetExistIdsStr()
    {
        if (CanConnectOracleServer)
        {
            DataTable dt = GetQustions();
            if (dt != null)
            {
                string[] idList = new string[dt.Rows.Count];
                for (int i = 0; i < dt.Rows.Count; i++)
                {
                    idList[i] = dt.Rows[i]["id"].ToString();
                }
                return string.Join(",", idList);
            }
            // The connection opened but the query failed: fall back to the local list
            // rather than reporting "nothing captured yet", which would make the caller
            // treat every question as new.
        }

        if (File.Exists(TxtPath))
        {
            return File.ReadAllText(TxtPath);
        }
        return string.Empty;
    }

    /// <summary>
    /// Stores the comma-separated ID list in Ids.txt, marking those questions as already seen.
    /// </summary>
    /// <param name="ids">Comma-separated question IDs.</param>
    public void WriteIdStrToTxt(string ids)
    {
        // File.WriteAllText creates the file when needed. A preceding File.Create would
        // leave an open stream behind and make this write fail with a sharing violation.
        File.WriteAllText(this.TxtPath, ids);
    }

    /// <summary>
    /// Appends a time-stamped entry to the program log, restarting the file once it
    /// has reached the maximum number of lines.
    /// </summary>
    /// <param name="LogMsg">Message to record.</param>
    public void AddLog(string LogMsg)
    {
        string logStr = string.Format("{0}:{1}.\n", DateTime.Now.ToString(), LogMsg);
        // A missing file simply counts as an empty log; it is created by the write below.
        bool logIsFull = File.Exists(LogPath) && File.ReadAllLines(LogPath).Length >= MaxLogLines;
        if (logIsFull)
        {
            File.WriteAllText(LogPath, logStr);
        }
        else
        {
            File.AppendAllText(LogPath, logStr + Environment.NewLine);
        }
    }

    /// <summary>
    /// Records a failed operation in the log without letting a logging problem
    /// replace the original failure.
    /// </summary>
    private void LogFailure(string operation, Exception ex)
    {
        try
        {
            AddLog(operation + ":" + ex.Message);
        }
        catch (Exception)
        {
            // Logging is best effort here: the callers report the failure through
            // their return value and must not start throwing because the log is unavailable.
        }
    }
}
—————————————————
数盟网站:www.dataunion.org
数盟微信:DataScientistUnion
数盟【大数据群】272089418
数盟【数据可视化群】 179287077
数盟【数据分析群】 110875722
—————————————————
点击阅读原文,查看原文代码~