我常使用C语言写网络爬虫,能够将网页爬出来,但是,图片却爬不出来,有没有大佬帮解决一下!!!
代码:
#include <stdio.h>
#include<string.h>
#include<WinSock2.h>
#pragma comment(lib,"ws2_32.lib")
/*
网络部分:http url
url 三部分
https://www.baidu.com/?tn=62095104_29_oem_dg&ch=6
1.协议 http 超文本传输协议
2.主机名 www.baidu.com 需要的ip地址 240e:ff:e020:966:0:ff:b042:f296
3.资源名 /?tn=62095104_29_oem_dg&ch=6*/void parseUrl(const char* url, char* host, char* resPath);
void getImgUrl(const char* html, char* imgUrl);typedef struct Spider
{char host[128]; //主机名char resPath[128]; //资源路径SOCKET fd;
}Spider;
//获取资源
void spider_init(Spider* spider, const char* url)
{memset(spider->host, 0, sizeof(spider->host));memset(spider->resPath, 0, sizeof(spider->host));parseUrl(url, spider->host, spider->resPath);
}
//连接到服务器:网络编程
void spider_connect(Spider* spider)
{//打开socket 2.2 确定买什么手机WSADATA wsadata;WSAStartup(MAKEWORD(2, 2), &wsadata);//创建socket 去买手机spider->fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);if (spider->fd == SOCKET_ERROR){printf("create socket falied %d\n", WSAGetLastError());return;}//通过域名获取ip地址HOSTENT* hent = gethostbyname(spider->host);if (!hent){printf("get host ip failed %d\n", WSAGetLastError());return;}//链接服务器SOCKADDR_IN addr;addr.sin_family = AF_INET;addr.sin_port = htons(80); //端口号 http:80 端序memcpy(&addr.sin_addr, hent->h_addr, sizeof(IN_ADDR));if (SOCKET_ERROR == connect(spider->fd, &addr, sizeof(addr))){printf("connect falied %d\n", WSAGetLastError());return;}
}//解析域名
void parseUrl(const char* url, char* host, char* resPath)
{if (!url)return;//https://www.baidu.com/?tn=62095104_29_oem_dg&ch=6// www.baidu.com/?tn=62095104_29_oem_dg&ch=6const char* ph = strstr(url, "//");ph = ph ? ph + 2 : url;//简写puts(ph);const char* pp = strstr(ph, "/");if (!pp){strcpy(host, ph);strcpy(resPath, "/"); //index.html}else{//先获取hoststrncpy(host, ph, pp - ph);//再获取resapathstrcpy(resPath, pp);}
}
//获取网页
void getHtml(Spider* spider)
{//连接到服务器spider_connect(spider);//给服务器发送请求 char header[128] = { 0 };sprintf(header, "GET %s HTTP/1.1\r\n", spider->resPath);sprintf(header + strlen(header), "Host:%s\r\n", spider->host);strcat(header, "Connection:close\r\n");strcat(header, "\r\n");if (SOCKET_ERROR == send(spider->fd, header, strlen(header), 0)) {printf("send failed %d\n", WSAGetLastError());return;}char html[1024 * 5] = { 0 };//获取网页int len = recv(spider->fd, html, 1024 * 5, 0);if (len == SOCKET_ERROR) {printf("recv failed %d\n", WSAGetLastError());}else{//保存到文件FILE* fp = fopen("maye.html", "w");{if (!fp)return;}fwrite(html, sizeof(char), strlen(html), fp);fclose(fp);printf("%s\n", html);}char imgUrl[128] = { 0 };getImgUrl(html, imgUrl); // 假设 getImgUrl 函数已经定义...Spider sp;spider_init(&sp, imgUrl); // 初始化 Spider 对象spider_connect(&sp);sprintf(header, "GET %s HTTP/1.1\r\n", sp.resPath);sprintf(header + strlen(header), "Host:%s\r\n", sp.host);strcat(header, "Connection:close\r\n");strcat(header, "Content-Type: image/jpeg\r\n");strcat(header, "\r\n");puts(header);if (SOCKET_ERROR == send(spider->fd, header, strlen(header), 0)) {printf("send failed %d\n", WSAGetLastError());return;}//获取图片char recvBuf[1024] = { 0 };len = recv(sp.fd, recvBuf, 1023, 0);//查找有没有\r\n\r\nchar* psp = strstr(recvBuf, "\r\n\r\n");if (!psp)return;psp += sizeof("\r\n\r\n");//接收图片数据FILE* fp = fopen("hello.png", "wb");fwrite(psp, sizeof(char), len - (psp - recvBuf), fp);fclose(fp);//继续接受没有接受完毕的while (1){len = recv(sp.fd, recvBuf, 1023, 0);if (len < 0){break;}else{fwrite(recvBuf, sizeof(char), len, fp);}}fclose(fp);printf("%s\n", imgUrl);
}
//获取网页中的图片链接
void getImgUrl(const char* html, char* imgUrl)
{if (!html || !imgUrl)return;char* beg = strstr(html, "<img src=\"");if (!beg){return;}else{printf("\n\n\n\n\n\n\n\n\n\n\n\n\n\n");//puts(beg+10);beg += 10;}//找结尾的双引号char* end = strstr(beg, "\"");if (!end){printf("网页错误\n");}else{strncpy(imgUrl, beg, end - beg);}
}//下载图片int main()
{printf("请输入要爬取的网址>");char url[512] = "http://www.netbian.com";//gets_s(url, 128);puts(url);Spider sp;spider_init(&sp, url);printf("Host:%s resPath:%s\n", sp.host, sp.resPath);getHtml(&sp);getchar();return 0;
}