一个被知乎管理员和谐了的“知乎数据抓取程序”(.net、c#数据挖掘)

机器学习 2408 Views

【数盟致力于成为最卓越的数据科学社区,聚焦于大数据、分析挖掘、数据可视化领域,业务范围:线下活动、在线课程、猎头服务、项目对接】

作者:wuyidexinsheng

问:能利用爬虫技术做到哪些很酷很有趣很有用的事情?

准备学习Python爬虫。各位大神都会用爬虫做哪些有趣的事情?今天突然想玩玩爬虫,就提了这个问题。跟着YouTube上的一个tutor写了个简单的程序,爬了一点豆瓣的数据。主要用到requests和bs4(BeautifulSoup)模块。虽然简陋,毕竟是人生中的第一只爬虫啊……以示纪念,代码写在博客里了:我的第一只爬虫:爬取豆瓣读书。

  <?xml version="1.0"?>
  <configuration>
    <appSettings>
      <!-- Data is stored in two places: Ids.txt in the program directory holds only
           the question IDs; the complete records are stored in the Oracle database. -->
      <add key="connStr" value="Data Source=(DESCRIPTION=(ADDRESS_LIST=(ADDRESS=(PROTOCOL=TCP)(HOST=127.0.0.1)(PORT=1521)))(CONNECT_DATA=(SERVER=DEDICATED)(SERVICE_NAME=DZZH)));User Id=xxxxx;Password=xxxx"/>

      <!-- Polling interval, in seconds. -->
      <add key="Interval" value="600"/>

      <!-- Mailbox used by the notification bot (SMTP server). -->
      <add key="smtpAddress" value="smtp.163.com"/>
      <!-- User name (sender address). -->
      <add key="sendEmailFrom" value="xxxxxxxx@163.com"/>
      <!-- Password. The program reads this value as plain text; keep real
           credentials out of source control. -->
      <add key="sendEmailFromPwd" value="xxxxxxxxx"/>
      <!-- Recipient addresses, separated by commas. -->
      <add key="strMailAddressTo" value="xxxxxxxx@163.com,xxxxxxxx@qq.com"/>
      <!-- Prefix used for the mail title. -->
      <add key="EmailName" value="zhApp-家里电脑"/>
      <!-- END -->

      <!-- URL of the page to monitor. -->
      <add key="WatchingURL" value="http://www.zhihu.com/people/wu-xin-sheng-7"/>
    </appSettings>
    <startup>
      <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.0"/>
    </startup>
  </configuration>

  • <span class=“name”>伍新生</span><span class=“bio” title=“五颜六色的情感,我毕生的追求!”>五颜六色的情感,我毕生的追求!</span>

  • </div>

  • </div>

  • <div class=“body clearfix”>

  • <div class=“zm-profile-header-avatar-container self”>

  • <img alt=“伍新生”

  • src=“http://pic4.zhimg.com/94cc60166_l.jpg”

  • class=“zm-profile-header-img zg-avatar-big zm-avatar-editor-preview”/>

  • <span class=“zm-entry-head-avatar-edit-button”>修改头像</span>

  /// <summary>
  /// Extracts every question link found in the HTML of the monitored page.
  /// </summary>
  /// <param name="source">Raw HTML of the monitored page. Null or empty yields an empty list.</param>
  /// <returns>One <see cref="Question"/> per <c>question_link</c> anchor, in document order.</returns>
  private List<Question> GetQuestions(string source)
  {
      List<Question> questions = new List<Question>();
      if (string.IsNullOrEmpty(source))
      {
          return questions;
      }

      const string startStr = "<a class=\"question_link\"";

      // Iterate instead of recursing once per anchor: the recursive form grew the
      // call stack with the number of links and re-copied the result list at every level.
      int anchorAt = source.IndexOf(startStr, StringComparison.Ordinal);
      while (anchorAt != -1)
      {
          Question q = new Question();

          // NOTE(review): StringPlus.SubString presumably returns the text between the
          // first occurrence of the start marker and the next end marker - confirm.
          string content = Cinser.Common.StringPlus.SubString(source, startStr, "</a>");
          q.Id = Cinser.Common.StringPlus.SubString(content, "question/", "\"");

          // Everything after the first '>' of the anchor is taken as the title.
          q.Title = content.Substring(content.IndexOf('>') + 1);
          q.Time = DateTime.Now;
          questions.Add(q);

          // Drop the anchor just handled and look for the next one in the remainder.
          source = source.Substring(anchorAt + startStr.Length);
          anchorAt = source.IndexOf(startStr, StringComparison.Ordinal);
      }

      return questions;
  }

  /// <summary>
  /// Background form of the monitor. On construction it hides itself, registers the
  /// program under the current user's Run key, and performs a first check of the
  /// configured page; afterwards every <c>timer1</c> tick repeats the check.
  /// New questions are mailed out and recorded through <see cref="DataProvider"/>.
  /// </summary>
  public partial class Form1 : Form
  {
      // SMTP port handed to the mail helper (previously the bare literal 0x19).
      private const int SmtpPort = 25;

      DataProvider dal;
      string watchingURL = string.Empty;
      int LoopCount = 0;

      /// <summary>
      /// Initializes the hidden form, reads the configuration, enables auto-start
      /// and runs the first monitoring pass.
      /// </summary>
      public Form1()
      {
          InitializeComponent();
          dal = new DataProvider();
          dal.AddLog("程序启动");

          // Show the window minimized once and hide it immediately, then strip the
          // taskbar entry and window buttons so the form stays out of sight.
          base.WindowState = FormWindowState.Minimized;
          base.Show();
          base.Hide();
          base.WindowState = FormWindowState.Normal;
          base.ShowInTaskbar = false;
          base.TopMost = false;
          base.MaximizeBox = false;
          base.MinimizeBox = false;
          base.ControlBox = false;

          // Set the polling interval.
          timer1.Interval = ReadIntervalMilliseconds();
          watchingURL = Cinser.Common.ConfigurationHelper.GetAppSettingsValue("WatchingURL");

          RunWhenStart(true, "zhApp.exe", "\"" + Application.StartupPath + "\\zhApp.exe\" AutoRun");
          Run();
          dal.AddLog("程序初始化成功");
          LoopCount += 1;
      }

      /// <summary>
      /// Reads the "Interval" setting (seconds) and converts it to milliseconds.
      /// </summary>
      private static int ReadIntervalMilliseconds()
      {
          int seconds = int.Parse(Cinser.Common.ConfigurationHelper.GetAppSettingsValue("Interval"));
          return seconds * 1000;
      }

      /// <summary>
      /// Adds or removes the program's auto-start entry under
      /// HKEY_CURRENT_USER\SOFTWARE\Microsoft\Windows\CurrentVersion\Run.
      /// </summary>
      /// <param name="Started">True to register the entry, false to remove it.</param>
      /// <param name="name">Name of the registry value.</param>
      /// <param name="path">Command line stored in the value.</param>
      /// <exception cref="InvalidOperationException">The Run key could not be opened or created.</exception>
      /// <remarks>
      /// Registry exceptions (for example missing permissions) propagate to the caller
      /// with their original stack trace; the former catch blocks used "throw ex",
      /// which discarded it.
      /// </remarks>
      public void RunWhenStart(bool Started, string name, string path)
      {
          // The using block releases the key handle on every path, including failures.
          using (RegistryKey runKey = Registry.CurrentUser.CreateSubKey(@"SOFTWARE\Microsoft\Windows\CurrentVersion\Run"))
          {
              if (runKey == null)
              {
                  throw new InvalidOperationException("Unable to open the CurrentVersion\\Run registry key.");
              }

              if (Started)
              {
                  runKey.SetValue(name, path);
              }
              else
              {
                  // false: do not throw when the value was never registered.
                  runKey.DeleteValue(name, false);
              }
          }
      }

      /// <summary>
      /// Runs one monitoring pass: downloads the monitored page, keeps only the
      /// questions whose ID has not been recorded yet, mails them and stores them.
      /// </summary>
      private void Run()
      {
          List<Question> questions = GetQuestions(Cinser.Common.HttpHelper.GetString(watchingURL));
          string ids = dal.GetExistIdsStr();

          // Compare whole IDs. A substring search on the comma-joined string treated
          // "123" as already seen whenever a longer ID such as "51234" was recorded.
          HashSet<string> knownIds = new HashSet<string>(StringComparer.Ordinal);
          if (!string.IsNullOrEmpty(ids))
          {
              foreach (string known in ids.Split(','))
              {
                  string trimmed = known.Trim();
                  if (trimmed.Length > 0)
                  {
                      knownIds.Add(trimmed);
                  }
              }
          }

          List<Question> fresh = new List<Question>();
          foreach (Question q in questions)
          {
              // Skip entries without an ID and anything already recorded; Add returns
              // false for duplicates, which also de-duplicates within the same page.
              if (string.IsNullOrEmpty(q.Id) || !knownIds.Add(q.Id))
              {
                  continue;
              }

              fresh.Add(q);
              ids = string.IsNullOrEmpty(ids) ? q.Id : ids + "," + q.Id;
          }

          if (fresh.Count > 0)
          {
              SendQuestions(fresh);
              dal.WriteIdStrToTxt(ids);
              dal.Add(fresh);
              dal.AddLog(string.Format("获取了{0}条新数据。", fresh.Count));
          }
      }

      /// <summary>
      /// Extracts every question link found in the HTML of the monitored page.
      /// </summary>
      /// <param name="source">Raw HTML of the monitored page. Null or empty yields an empty list.</param>
      /// <returns>One <see cref="Question"/> per <c>question_link</c> anchor, in document order.</returns>
      private List<Question> GetQuestions(string source)
      {
          List<Question> questions = new List<Question>();
          if (string.IsNullOrEmpty(source))
          {
              return questions;
          }

          const string startStr = "<a class=\"question_link\"";

          // Iterate instead of recursing once per anchor: the recursive form grew the
          // call stack with the number of links and re-copied the result list at every level.
          int anchorAt = source.IndexOf(startStr, StringComparison.Ordinal);
          while (anchorAt != -1)
          {
              Question q = new Question();

              // NOTE(review): StringPlus.SubString presumably returns the text between the
              // first occurrence of the start marker and the next end marker - confirm.
              string content = Cinser.Common.StringPlus.SubString(source, startStr, "</a>");
              q.Id = Cinser.Common.StringPlus.SubString(content, "question/", "\"");

              // Everything after the first '>' of the anchor is taken as the title.
              q.Title = content.Substring(content.IndexOf('>') + 1);
              q.Time = DateTime.Now;
              questions.Add(q);

              // Drop the anchor just handled and look for the next one in the remainder.
              source = source.Substring(anchorAt + startStr.Length);
              anchorAt = source.IndexOf(startStr, StringComparison.Ordinal);
          }

          return questions;
      }

      /// <summary>
      /// Mails the given questions to the recipients listed in the configuration.
      /// </summary>
      /// <param name="questions">Questions to include in the mail body.</param>
      /// <returns>Always true; the result of the send is not inspected here.</returns>
      private bool SendQuestions(List<Question> questions)
      {
          bool bSuccess = true;
          List<string> strMailAddressTo = Cinser.Common.ConfigurationHelper.GetAppSettingsValue("strMailAddressTo").Split(',').ToList();
          string smtpAddress = Cinser.Common.ConfigurationHelper.GetAppSettingsValue("smtpAddress");
          string sendEmailFrom = Cinser.Common.ConfigurationHelper.GetAppSettingsValue("sendEmailFrom");
          string sendEmailFromPwd = Cinser.Common.ConfigurationHelper.GetAppSettingsValue("sendEmailFromPwd");
          string emailName = Cinser.Common.ConfigurationHelper.GetAppSettingsValue("EmailName");

          // Password decryption. Drop this step when open-sourcing, so that a plain-text
          // password can be put in the configuration directly.
          //sendEmailFromPwd = Cinser.Common.Security.DecryptDES(sendEmailFromPwd, "yuiophgf");

          string msg = string.Empty;
          SendCompletedEventHandler s = new SendCompletedEventHandler(SendCompleted);
          string content = GetQustionsListStr(questions);
          content += "\n\n信息来源于:" + watchingURL;

          // NOTE(review): msg is presumably a status/error text filled in by SendEmail;
          // it is currently ignored - confirm whether it should be logged.
          Cinser.Common.SmtpEmailSend.SendEmail(strMailAddressTo, emailName + DateTime.Now.ToString(), content, smtpAddress, SmtpPort, sendEmailFrom, sendEmailFromPwd, "163测试邮箱", null, out msg, s);
          return bSuccess;
      }

      /// <summary>
      /// Completion callback passed to the mail helper; intentionally does nothing.
      /// </summary>
      private void SendCompleted(object sender, AsyncCompletedEventArgs e)
      {
      }

      /// <summary>
      /// Formats the questions as one "name, url" entry per line for the mail body.
      /// </summary>
      /// <param name="questions">Questions to format; null or empty yields an empty string.</param>
      /// <returns>The formatted entries, separated by a line feed followed by a space.</returns>
      private string GetQustionsListStr(List<Question> questions)
      {
          if (questions == null || questions.Count == 0)
          {
              return string.Empty;
          }

          // Same output as the former concatenation loop, without quadratic string copying.
          return string.Join("\n ", questions.Select(q => string.Format("名称:{0},url:{1}", q.Title, q.Url)));
      }

      /// <summary>
      /// Timer callback: re-reads the interval from the configuration, runs one
      /// monitoring pass and logs the loop counter.
      /// </summary>
      private void timer1_Tick(object sender, EventArgs e)
      {
          timer1.Interval = ReadIntervalMilliseconds();
          Run();
          dal.AddLog(string.Format("程序循环次数:{0}", LoopCount++));
      }
  }

  /// <summary>
  /// A question scraped from the monitored page.
  /// </summary>
  public class Question
  {
      // Explicitly assigned URL; empty means "derive it from Id".
      private string url;

      /// <summary>Free-form remark.</summary>
      public string Remark { get; set; }

      /// <summary>Time associated with the question (the scraper sets the capture time).</summary>
      public DateTime Time { get; set; }

      /// <summary>Type of the question.</summary>
      public string Type { get; set; }

      /// <summary>
      /// URL of the question. When no URL has been assigned, it is built from
      /// <see cref="Id"/> on every read, so it always reflects the current Id
      /// (the previous implementation cached the first computed value and went
      /// stale when Id changed afterwards).
      /// </summary>
      public string Url
      {
          get
          {
              if (string.IsNullOrEmpty(url))
              {
                  return string.Format("http://www.zhihu.com/question/{0}", Id);
              }

              return url;
          }
          set { url = value; }
      }

      /// <summary>Title text of the question.</summary>
      public string Title { get; set; }

      /// <summary>Identifier of the question, as it appears in the question URL.</summary>
      public string Id { get; set; }
  }

  /// <summary>
  /// Data-access provider for the scraped Zhihu questions. Complete records go to
  /// the Oracle table; the list of seen IDs is mirrored to Ids.txt and a log is
  /// written to Log.txt, both next to the executable.
  /// </summary>
  public class DataProvider
  {
      // The log file is restarted once it holds this many lines.
      private const int MaxLogLines = 520;

      private string connStr = string.Empty;
      Cinser.DBUtility.DAL.OracleDALCommon dal;
      string txtPath = "";
      string logPath = "";
      string debugPath = string.Empty;

      /// <summary>
      /// Directory used for Ids.txt and Log.txt, always ending with a backslash.
      /// Defaults to the application base directory.
      /// </summary>
      public string DebugPath
      {
          get
          {
              if (string.IsNullOrEmpty(debugPath))
              {
                  debugPath = System.AppDomain.CurrentDomain.BaseDirectory;
              }

              if (!debugPath.EndsWith("\\", StringComparison.Ordinal))
              {
                  debugPath += "\\";
              }

              return debugPath;
          }
          set { debugPath = value; }
      }

      /// <summary>Full path of the log file (Log.txt under <see cref="DebugPath"/>).</summary>
      public string LogPath
      {
          get
          {
              if (logPath == string.Empty)
              {
                  logPath = DebugPath + "Log.txt";
              }

              return logPath;
          }
      }

      /// <summary>
      /// Creates the provider and its Oracle helper from the configured connection string.
      /// </summary>
      public DataProvider()
      {
          dal = new Cinser.DBUtility.DAL.OracleDALCommon(this.ConnStr);
      }

      /// <summary>Full path of the ID file (Ids.txt under <see cref="DebugPath"/>).</summary>
      public string TxtPath
      {
          get
          {
              if (txtPath == string.Empty)
              {
                  txtPath = DebugPath + "Ids.txt";
              }

              return txtPath;
          }
      }

      /// <summary>
      /// Oracle connection string; read from the "connStr" application setting on first use.
      /// </summary>
      public string ConnStr
      {
          get
          {
              if (connStr == string.Empty)
              {
                  connStr = Cinser.Common.ConfigurationHelper.GetAppSettingsValue("connStr");
              }

              return connStr;
          }
          set { connStr = value; }
      }

      /// <summary>
      /// Result of asking the Oracle helper to open a connection.
      /// </summary>
      public bool CanConnectOracleServer
      {
          get
          {
              return dal.Open();
          }
      }

      /// <summary>
      /// Writes the scraped questions to the Oracle database.
      /// </summary>
      /// <param name="questions">Questions to insert.</param>
      /// <returns>True when every insert succeeded; false on null input or on any error.</returns>
      public bool Add(List<Question> questions)
      {
          if (questions == null)
          {
              return false;
          }

          try
          {
              foreach (Question question in questions)
              {
                  dal.Add("qustions", question);
              }

              return true;
          }
          catch (Exception ex)
          {
              // Still best-effort, but leave a trace instead of swallowing the failure.
              AddLog("Failed to save questions to the database: " + ex.Message);
              return false;
          }
      }

      /// <summary>
      /// Reads rows of the questions table.
      /// </summary>
      /// <param name="sqlWhere">WHERE condition handed to the Oracle helper; the default selects every row.</param>
      /// <returns>The matching rows, or null when the query failed.</returns>
      public DataTable GetQustions(string sqlWhere = "1=1")
      {
          try
          {
              return dal.GetDataList("qustions", sqlWhere);
          }
          catch (Exception ex)
          {
              AddLog("Failed to read questions from the database: " + ex.Message);
              return null;
          }
      }

      /// <summary>
      /// Checks whether a question with the given ID is already stored.
      /// </summary>
      /// <param name="id">Question ID to look up.</param>
      /// <returns>True when at least one row matches; false otherwise or when the query failed.</returns>
      public bool IsExist(string id)
      {
          if (string.IsNullOrEmpty(id))
          {
              return false;
          }

          try
          {
              // The ID comes from scraped HTML: double any single quote so it cannot
              // terminate the SQL string literal.
              // NOTE(review): prefer bind parameters if the Oracle helper supports them.
              string sqlWhere = "id='" + id.Replace("'", "''") + "'";
              DataTable dt = dal.GetDataList("qustions", sqlWhere);
              return dt.Rows.Count > 0;
          }
          catch
          {
              return false;
          }
      }

      /// <summary>
      /// Returns the IDs of the questions already captured, joined with commas.
      /// </summary>
      /// <returns>
      /// The IDs from the Oracle table when the server is reachable; otherwise the
      /// content of the local Ids.txt, or an empty string when that file does not exist.
      /// </returns>
      public string GetExistIdsStr()
      {
          // Use the Oracle database when the remote server can be reached.
          if (CanConnectOracleServer)
          {
              DataTable dt = GetQustions();
              if (dt == null || dt.Rows.Count == 0)
              {
                  return string.Empty;
              }

              List<string> idList = new List<string>(dt.Rows.Count);
              foreach (DataRow row in dt.Rows)
              {
                  idList.Add(row["id"].ToString());
              }

              return string.Join(",", idList.ToArray());
          }

          // The remote Oracle server is down or unreachable: fall back to the local Ids.txt.
          if (File.Exists(TxtPath))
          {
              return File.ReadAllText(TxtPath);
          }

          return string.Empty;
      }

      /// <summary>
      /// Stores the ID string in Ids.txt, marking those questions as already read.
      /// </summary>
      /// <param name="ids">Comma-separated question IDs.</param>
      public void WriteIdStrToTxt(string ids)
      {
          // WriteAllText creates the file when needed. The former File.Create call left
          // an open stream behind, which made this write fail on first use.
          File.WriteAllText(this.TxtPath, ids);
      }

      /// <summary>
      /// Appends a time-stamped entry to the program log, for error tracing.
      /// The log is restarted once it has reached <c>MaxLogLines</c> lines.
      /// </summary>
      /// <param name="LogMsg">Message to record.</param>
      public void AddLog(string LogMsg)
      {
          string logStr = string.Format("{0}:{1}.\n", DateTime.Now.ToString(), LogMsg);

          // A missing file simply counts as empty; creating it via File.Create kept the
          // file locked and broke the read that followed.
          int lineCount = File.Exists(this.LogPath) ? File.ReadAllLines(LogPath).Length : 0;

          if (lineCount >= MaxLogLines)
          {
              File.WriteAllText(LogPath, logStr);
          }
          else
          {
              // The using block closes the writer even if the write throws.
              using (StreamWriter sw = File.AppendText(LogPath))
              {
                  sw.WriteLine(logStr);
              }
          }
      }
  }

  • —————————————————

    数盟网站:www.dataunion.org

    数盟微博:@数盟社区

    数盟微信:DataScientistUnion

    数盟【大数据群】272089418

    数盟【数据可视化群】 179287077

    数盟【数据分析群】 110875722

    —————————————————

    点击阅读原文,查看原文代码~

    如未说明则本站原创,转载请注明出处:NULL » 一个被知乎管理员和谐了的“知乎数据抓取程序”(.net、c#数据挖掘)