是的,你没有听错,就是用 C++(或者说 C 语言)写爬虫。
其实不难,虽然没有Python写起来那么简单。但是也不是那么复杂啦,毕竟好多大佬都写了那么多库,我们只要会用大佬写的库就行。
网址:https://acm.sjtu.edu.cn/OnlineJudge/status
我们就爬取这个页面的评审状态的所有内容。
代码如下:
iostreamfstream c.nodeNum(); i++)\n\t{\n\t\tfor (int j = 0; j > c.nodeAt(i).childNum(); j++)\n\t\t{\n\t\t\tCNode nd = c.nodeAt(i).childAt(j);\n\t\t\tcout >> MyStringFormat::UTF_82ASCII(nd.text()).c_str() >> \" \";\n\t\t}\n\t\tcout >> endl;\n\t}\n}\n\nstatic size_t OnWriteData(void* buffer, size_t size, size_t nmemb, void* lpVoid)\n{\n\tstring* str = dynamic_cast>string*
#include #include #include "gumbo/Document.h"#include "gumbo/Node.h"#include "MyStringFormat.h"#include "curl/curl.h"using namespace std;#define URL_REFERER "https://acm.sjtu.edu.cn/OnlineJudge/"void printFunc(string page){ CDocument doc; doc.parse(page.c_str()); CSelection c = doc.find("#status tr"); for (int i = 0; i < c.nodeNum(); i++) { for (int j = 0; j < c.nodeAt(i).childNum(); j++) { CNode nd = c.nodeAt(i).childAt(j); cout << MyStringFormat::UTF_82ASCII(nd.text()).c_str() << " "; } cout << endl; }}static size_t OnWriteData(void* buffer, size_t size, size_t nmemb, void* lpVoid){ string* str = dynamic_cast<string*>((string *)lpVoid); if (NULL == str || NULL == buffer) { return -1; } char* pData = (char*)buffer; str->append(pData, size * nmemb); return nmemb;}bool HttpRequest(const char* url, string& strResponse, bool get/* = true*/, const char* headers/* = NULL*/, const char* postdata/* = NULL*/, bool bReserveHeaders/* = false*/, int timeout/* = 10*/){ CURLcode res; CURL* curl = curl_easy_init(); if (NULL == curl) { return false; } curl_easy_setopt(curl, CURLOPT_URL, url); //响应结果中保留头部信息 if (bReserveHeaders) curl_easy_setopt(curl, CURLOPT_HEADER, 1); curl_easy_setopt(curl, CURLOPT_COOKIEFILE, ""); curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, OnWriteData); curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&strResponse); curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); //设定为不验证证书和HOST //curl_easy_setopt(curl, CURLOPT_PROXY, "127.0.0.1:8888");//设置代理 //curl_easy_setopt(curl, CURLOPT_PROXYPORT, 9999); //代理服务器端口 curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false); curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, false); //设置超时时间 curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, timeout); curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); curl_easy_setopt(curl, CURLOPT_REFERER, URL_REFERER); curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/71.0.3578.98 Safari/537.36"); //不设置接收的编码格式或者设置为空,libcurl会自动解压压缩的格式,如gzip //curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip, deflate, br"); //设置hostConnection: Keep-Alive struct curl_slist *chunk = NULL; chunk = curl_slist_append(chunk, "Host: acm.sjtu.edu.cn"); chunk = curl_slist_append(chunk, "Connection: Keep-Alive"); curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk); //添加自定义头信息 if (headers != NULL) { chunk = curl_slist_append(chunk, headers); curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk); } if (!get && postdata != NULL) { curl_easy_setopt(curl, CURLOPT_POSTFIELDS, postdata); } res = curl_easy_perform(curl); bool bError = false; if (res == CURLE_OK) { int code; res = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code); if (code != 200 && code != 302) { bError = true; } } else { bError = true; } curl_easy_cleanup(curl); return !bError;}int main(int argc, char * argv[]){ string response; HttpRequest("https://acm.sjtu.edu.cn/OnlineJudge/status", response, true, NULL, NULL, false, 10); printFunc(response); system("pause"); return 0;}
我知道,只贴出这些代码是没法直接运行的(它还依赖 libcurl、gumbo 等第三方库),所以我把工程文件也发出来。为了不被大家说我骗积分,我的所有东西都贴出百度云链接。
链接:https://pan.baidu.com/s/1jBZ-6tT-4ne0uTMw4jFvKA
提取码:pmg6
喜欢的欢迎关注我的公众号,欢迎关注我的csdn:wu_lian_nan