前言
我突发奇想,想用C++写一个爬虫(真该死,为啥我的想法这么丰富呢)于是在收集大量资料后,成功写出了支持HTTP和HTTPS协议请求的代码,并能支持解析请求内容
用的OpenSSL和Socket实现的HTTPS请求
支持库
- <cstring>
- <string>
- <sys/socket.h>
- <netinet/in.h>
- <netdb.h>
- <vector>
- <unistd.h>
- <fcntl.h>
- <chrono>
- <sys/time.h>
- <openssl/ssl.h>
- <openssl/err.h>
#include <fcntl.h>
#include <netdb.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <unistd.h>

#include <cerrno>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>

#include <openssl/err.h>
#include <openssl/ssl.h>
代码编写
这里采用面向对象的方式编写,提供了操作接口
main.h
class HttpsAndHttpRequest { private: sockaddr_in serverAddr; int socketFD = 0; int timeOutVal; struct UrlStructure { std::string agreement; std::string host; hostent *ip; std::string port; std::string path; std::string param; }; UrlStructure urlAnalysis(std::string url); // https SSL_CTX *ctx = NULL; SSL *ssl = NULL; std::string httpResponse; struct httpProtocol { std::string allContent; std::string option; std::string content; }; bool noBlock = true; void analysisHttpProtocol(); std::string httpRequest(std::string url); std::string httpsRequest(std::string url); public: bool useHandleProtocol = false; HttpsAndHttpRequest(int timeoutSeconds = 5, bool noBlockSet = true) : timeOutVal(timeoutSeconds) { // SSL初始化 SSL_library_init(); OpenSSL_add_all_algorithms(); SSL_load_error_strings(); ctx = SSL_CTX_new(SSLv23_client_method()); noBlock = noBlockSet; } std::string ConnectWebsite(const std::string &url, bool analysis = false); void setTimeoutSeconds(int time); ~HttpsAndHttpRequest() { if (ssl) { SSL_shutdown(ssl); SSL_free(ssl); ssl = NULL; } if (ctx) SSL_CTX_free(ctx); } protected: std::vector<httpProtocol> protocol; virtual void handleProtocol() = 0; };
main.cpp
#include "main.h"

// File-local helpers. They are free functions because they must not require
// any change to the class declaration in main.h.
namespace
{
using Clock = std::chrono::steady_clock;

// String-only view of a URL; parsing is kept separate from DNS resolution.
struct UrlParts
{
    std::string scheme;
    std::string host;
    std::string port;
    std::string path;  // Without the leading '/'.
    std::string query; // Without the leading '?'.
};

// Splits "[scheme://]host[:port][/path][?query][#fragment]" into its parts.
UrlParts splitUrl(std::string url)
{
    const std::string::size_type npos = std::string::npos;
    UrlParts parts;

    // A fragment is never sent to the server.
    const std::string::size_type fragment = url.find('#');
    if (fragment != npos)
        url.erase(fragment);

    // "://" only marks a scheme when it comes before any path or query,
    // so "host/redirect?u=http://x" is not mistaken for a scheme.
    std::string::size_type authorityStart = 0;
    const std::string::size_type schemeEnd = url.find("://");
    if (schemeEnd != npos && url.find_first_of("/?") == schemeEnd + 1)
    {
        parts.scheme = url.substr(0, schemeEnd);
        authorityStart = schemeEnd + 3;
    }

    const std::string::size_type authorityEnd = url.find_first_of("/?", authorityStart);
    const std::string authority =
        url.substr(authorityStart, authorityEnd == npos ? npos : authorityEnd - authorityStart);

    if (authorityEnd != npos)
    {
        const std::string::size_type restStart =
            url[authorityEnd] == '/' ? authorityEnd + 1 : authorityEnd;
        const std::string::size_type queryPos = url.find('?', restStart);
        parts.path = url.substr(restStart, queryPos == npos ? npos : queryPos - restStart);
        if (queryPos != npos)
            parts.query = url.substr(queryPos + 1);
    }

    const std::string::size_type colon = authority.find(':');
    parts.host = authority.substr(0, colon);
    if (colon != npos)
        parts.port = authority.substr(colon + 1);
    return parts;
}

// Rebuilds a URL string; the port and the query are only emitted when present.
std::string composeUrl(const std::string &scheme, const UrlParts &parts)
{
    std::string url = scheme + "://" + parts.host;
    if (!parts.port.empty())
        url += ":" + parts.port;
    url += "/" + parts.path;
    if (!parts.query.empty())
        url += "?" + parts.query;
    return url;
}

// Builds the GET request, including the query string and, for a non-default
// port, the port in the Host header.
std::string buildGetRequest(const std::string &host, const std::string &port,
                            const std::string &defaultPort, const std::string &path,
                            const std::string &param)
{
    std::string request = "GET /" + path;
    if (!param.empty())
        request += "?" + param;
    request += " HTTP/1.1\r\n";
    request += "Host: " + host;
    if (port != defaultPort)
        request += ":" + port;
    request += "\r\n";
    request += "Connection: close\r\n";
    request += "\r\n";
    return request;
}

Clock::time_point deadlineAfter(int timeoutSeconds)
{
    return Clock::now() + std::chrono::seconds(timeoutSeconds);
}

// Waits until fd is readable (or writable) or the deadline passes.
// The timeval is rebuilt on every call, so no field is left uninitialised.
bool waitForSocket(int fd, bool forWrite, Clock::time_point deadline)
{
    if (fd < 0 || fd >= FD_SETSIZE)
        return false;
    for (;;)
    {
        const long long remaining =
            std::chrono::duration_cast<std::chrono::microseconds>(deadline - Clock::now()).count();
        if (remaining <= 0)
            return false;

        timeval tv;
        tv.tv_sec = static_cast<time_t>(remaining / 1000000);
        tv.tv_usec = static_cast<suseconds_t>(remaining % 1000000);

        fd_set fds;
        FD_ZERO(&fds);
        FD_SET(fd, &fds);
        const int ready =
            select(fd + 1, forWrite ? nullptr : &fds, forWrite ? &fds : nullptr, nullptr, &tv);
        if (ready > 0)
            return true;
        if (ready == 0 || errno != EINTR)
            return false;
    }
}

// Fills an IPv4 socket address from a resolver entry and a decimal port string.
bool fillServerAddress(const hostent *entry, const std::string &port, sockaddr_in &address)
{
    if (!entry || entry->h_addrtype != AF_INET || !entry->h_addr_list || !entry->h_addr_list[0] ||
        entry->h_length != static_cast<int>(sizeof(address.sin_addr)))
        return false;

    char *parseEnd = nullptr;
    const long portNumber = std::strtol(port.c_str(), &parseEnd, 10);
    if (port.empty() || *parseEnd != '\0' || portNumber <= 0 || portNumber > 65535)
        return false;

    std::memset(&address, 0, sizeof(address));
    address.sin_family = AF_INET;
    address.sin_port = htons(static_cast<unsigned short>(portNumber));
    std::memcpy(&address.sin_addr, entry->h_addr_list[0], sizeof(address.sin_addr));
    return true;
}

// Creates a TCP socket and connects it. Returns the descriptor, or -1 on
// failure (in which case no descriptor is left open).
int openTcpConnection(const sockaddr_in &address, bool nonBlocking, Clock::time_point deadline)
{
    const int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0)
        return -1;

    if (nonBlocking)
    {
        const int flags = fcntl(fd, F_GETFL, 0);
        if (flags < 0 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0)
        {
            close(fd);
            return -1;
        }
    }

    if (connect(fd, reinterpret_cast<const sockaddr *>(&address), sizeof(address)) == 0)
        return fd;

    // A connect that is still in progress finishes asynchronously: wait for
    // writability, then read the real outcome from SO_ERROR.
    int pendingError = 0;
    socklen_t errorLength = sizeof(pendingError);
    const bool inProgress = (errno == EINPROGRESS || errno == EINTR);
    if (!inProgress || !waitForSocket(fd, true, deadline) ||
        getsockopt(fd, SOL_SOCKET, SO_ERROR, &pendingError, &errorLength) < 0 || pendingError != 0)
    {
        close(fd);
        return -1;
    }
    return fd;
}

// Sends the whole buffer over a plain socket, coping with short writes.
bool sendAll(int fd, const std::string &data, Clock::time_point deadline)
{
    std::string::size_type sent = 0;
    while (sent < data.size())
    {
        // MSG_NOSIGNAL: a peer that already closed must not raise SIGPIPE.
        const ssize_t written = send(fd, data.data() + sent, data.size() - sent, MSG_NOSIGNAL);
        if (written > 0)
        {
            sent += static_cast<std::string::size_type>(written);
            continue;
        }
        if (written < 0 && errno == EINTR)
            continue;
        if (written < 0 && (errno == EAGAIN || errno == EWOULDBLOCK) &&
            waitForSocket(fd, true, deadline))
            continue;
        return false;
    }
    return true;
}

// Reads from a plain socket until the peer closes, an error occurs, or the
// deadline passes. Whatever was received so far is returned.
std::string readUntilClosed(int fd, Clock::time_point deadline)
{
    std::string response;
    char buffer[4096];
    while (waitForSocket(fd, false, deadline))
    {
        const ssize_t received = read(fd, buffer, sizeof(buffer));
        if (received > 0)
            response.append(buffer, static_cast<std::string::size_type>(received));
        else if (received == 0 || (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK))
            break;
    }
    return response;
}

// After a failed SSL call: waits for the direction OpenSSL asked for.
// Returns true when the call should be retried, false on any other error,
// on a clean TLS shutdown, or on timeout.
bool waitForTlsRetry(SSL *session, int callResult, int fd, Clock::time_point deadline)
{
    switch (SSL_get_error(session, callResult))
    {
    case SSL_ERROR_WANT_READ:
        return waitForSocket(fd, false, deadline);
    case SSL_ERROR_WANT_WRITE:
        return waitForSocket(fd, true, deadline);
    default:
        return false;
    }
}

// Drives the TLS handshake to completion.
bool completeHandshake(SSL *session, int fd, Clock::time_point deadline)
{
    for (;;)
    {
        ERR_clear_error(); // SSL_get_error() requires an empty error queue.
        const int result = SSL_connect(session);
        if (result == 1)
            return true;
        if (!waitForTlsRetry(session, result, fd, deadline))
            return false;
    }
}

// Writes the whole request over TLS. SSL_write is retried with the same
// buffer, as OpenSSL requires after WANT_READ / WANT_WRITE.
bool sendAllTls(SSL *session, int fd, const std::string &data, Clock::time_point deadline)
{
    for (;;)
    {
        ERR_clear_error();
        const int result = SSL_write(session, data.data(), static_cast<int>(data.size()));
        if (result > 0)
            return true;
        if (!waitForTlsRetry(session, result, fd, deadline))
            return false;
    }
}

// Reads over TLS until the peer closes, an error occurs, or the deadline
// passes. WANT_READ is treated as "wait and retry", not as end of data.
std::string readAllTls(SSL *session, int fd, bool nonBlocking, Clock::time_point deadline)
{
    std::string response;
    char buffer[4096];
    while (Clock::now() < deadline)
    {
        // A blocking SSL_read would ignore the timeout, so wait for data first
        // unless OpenSSL already holds decrypted bytes.
        if (!nonBlocking && SSL_pending(session) == 0 && !waitForSocket(fd, false, deadline))
            break;

        ERR_clear_error();
        const int received = SSL_read(session, buffer, static_cast<int>(sizeof(buffer)));
        if (received > 0)
            response.append(buffer, static_cast<std::string::size_type>(received));
        else if (!waitForTlsRetry(session, received, fd, deadline))
            break;
    }
    return response;
}

// Releases whatever a previous request may have left behind.
void releaseStaleConnection(SSL *&session, int &fd)
{
    if (session)
    {
        SSL_free(session);
        session = nullptr;
    }
    if (fd > 0)
        close(fd);
    fd = 0;
}
} // namespace

/**
 * Splits a URL into scheme, host, port, path and query, and resolves the host.
 *
 * The path is stored without its leading '/', the query without its '?'.
 * `ip` is null when the host is empty or cannot be resolved. It points at
 * storage owned by gethostbyname(), so it is only valid until the next lookup.
 */
HttpsAndHttpRequest::UrlStructure HttpsAndHttpRequest::urlAnalysis(std::string url)
{
    const UrlParts parts = splitUrl(url);

    UrlStructure result;
    result.agreement = parts.scheme;
    result.host = parts.host;
    result.port = parts.port;
    result.path = parts.path;
    result.param = parts.query;
    result.ip = result.host.empty() ? nullptr : gethostbyname(result.host.c_str());
    return result;
}

/**
 * Fetches `url` with a GET request and returns the raw response.
 *
 * "https" and "http" URLs use the matching protocol. Any other or missing
 * scheme tries HTTPS first and falls back to plain HTTP if that yields nothing.
 *
 * @param url      Address to fetch; scheme, port, path and query are optional.
 * @param analysis When true, the response is also parsed into `protocol`.
 * @return The raw response (status line, headers and body), or "" on failure.
 */
std::string HttpsAndHttpRequest::ConnectWebsite(const std::string &url, bool analysis)
{
    const UrlParts parts = splitUrl(url);

    std::string result;
    if (parts.scheme == "https")
    {
        result = httpsRequest(composeUrl("https", parts));
    }
    else if (parts.scheme == "http")
    {
        result = httpRequest(composeUrl("http", parts));
    }
    else
    {
        result = httpsRequest(composeUrl("https", parts));
        if (result.empty())
            result = httpRequest(composeUrl("http", parts));
    }

    if (analysis)
    {
        httpResponse = result;
        analysisHttpProtocol();
    }
    return result;
}

/**
 * Performs a plain HTTP GET and returns the raw response ("" on failure).
 *
 * The timeout applies once to connecting and once, as a total budget, to
 * sending the request and reading the response. The socket is always closed
 * before returning.
 */
std::string HttpsAndHttpRequest::httpRequest(std::string url)
{
    releaseStaleConnection(ssl, socketFD);

    UrlStructure target = urlAnalysis(url);
    if (target.port.empty())
        target.port = "80";
    if (!fillServerAddress(target.ip, target.port, serverAddr))
        return "";

    const int fd = openTcpConnection(serverAddr, noBlock, deadlineAfter(timeOutVal));
    if (fd < 0)
        return "";
    socketFD = fd;

    const std::string request =
        buildGetRequest(target.host, target.port, "80", target.path, target.param);

    std::string response;
    const Clock::time_point deadline = deadlineAfter(timeOutVal);
    if (sendAll(fd, request, deadline))
        response = readUntilClosed(fd, deadline);

    close(fd);
    socketFD = 0; // Never leave a closed descriptor behind for the next request.
    return response;
}

/**
 * Performs an HTTPS GET and returns the raw response ("" on failure).
 *
 * The timeout applies separately to connecting, to the TLS handshake, and, as
 * a total budget, to sending the request and reading the response. The TLS
 * session and the socket are always released before returning.
 *
 * NOTE(review): the peer certificate is not verified, exactly as before; no
 * verify mode or trust store is configured on `ctx` in the visible code.
 */
std::string HttpsAndHttpRequest::httpsRequest(std::string url)
{
    releaseStaleConnection(ssl, socketFD);
    if (!ctx)
        return "";

    UrlStructure target = urlAnalysis(url);
    if (target.port.empty())
        target.port = "443";
    if (!fillServerAddress(target.ip, target.port, serverAddr))
        return "";

    const int fd = openTcpConnection(serverAddr, noBlock, deadlineAfter(timeOutVal));
    if (fd < 0)
        return "";
    socketFD = fd;

    std::string response;
    ssl = SSL_new(ctx);
    if (ssl && SSL_set_fd(ssl, fd) == 1)
    {
        // Send the host name (SNI) so name-based virtual hosts pick the right
        // certificate; it is not sent for dotted-decimal addresses.
        if (target.host.find_first_not_of("0123456789.") != std::string::npos)
            SSL_set_tlsext_host_name(ssl, target.host.c_str());

        if (completeHandshake(ssl, fd, deadlineAfter(timeOutVal)))
        {
            const std::string request =
                buildGetRequest(target.host, target.port, "443", target.path, target.param);
            const Clock::time_point deadline = deadlineAfter(timeOutVal);
            if (sendAllTls(ssl, fd, request, deadline))
                response = readAllTls(ssl, fd, noBlock, deadline);
        }
    }

    if (ssl)
    {
        // Answer the peer's close_notify; after any other ending the
        // connection is simply dropped.
        if (SSL_get_shutdown(ssl) & SSL_RECEIVED_SHUTDOWN)
            SSL_shutdown(ssl);
        SSL_free(ssl);
        ssl = nullptr;
    }
    close(fd);
    socketFD = 0;
    return response;
}

/**
 * Splits `httpResponse` into `protocol` entries.
 *
 * - The first line becomes option "request-result" with the second
 *   space-separated token (the status code) as content.
 * - Each following "Name:value" line becomes option "Name" with everything
 *   after the first ':' as content (leading whitespace is kept).
 * - Everything after the blank line becomes a single "request-date" entry.
 *
 * Previous entries are always discarded, even when the response is empty.
 */
void HttpsAndHttpRequest::analysisHttpProtocol()
{
    protocol.clear();
    if (httpResponse.empty())
        return;

    const std::string::size_type npos = std::string::npos;
    std::string::size_type lineStart = 0;
    for (;;)
    {
        const std::string::size_type lineEnd = httpResponse.find("\r\n", lineStart);
        const std::string line =
            httpResponse.substr(lineStart, lineEnd == npos ? npos : lineEnd - lineStart);

        if (line.empty())
        {
            // A blank line ends the headers. Without a terminator there is no
            // body to record.
            if (lineEnd != npos)
                protocol.push_back({"request-date : ...", "request-date", httpResponse.substr(lineEnd + 2)});
            break;
        }

        std::string option;
        std::string content;
        if (lineStart == 0)
        {
            option = "request-result";
            const std::string::size_type firstSpace = line.find(' ');
            const std::string::size_type codeStart = firstSpace == npos ? 0 : firstSpace + 1;
            const std::string::size_type codeEnd = line.find(' ', codeStart);
            content = line.substr(codeStart, codeEnd == npos ? npos : codeEnd - codeStart);
        }
        else
        {
            const std::string::size_type colon = line.find(':');
            option = line.substr(0, colon);
            content = colon == npos ? line : line.substr(colon + 1);
        }
        protocol.push_back({line, option, content});

        if (lineEnd == npos)
            break;
        lineStart = lineEnd + 2;
    }
    useHandleProtocol = false;
}

/** Sets the timeout, in seconds, used by subsequent requests. */
void HttpsAndHttpRequest::setTimeoutSeconds(int time)
{
    timeOutVal = time;
}
示例
这里涉及到虚函数,没接触过面向对象的游戏玩家(地球Online)可能不懂
当你想要实例化(初始化)一个类的时候,如果类里含有纯虚函数(用 virtual 声明并以 = 0 结尾),就必须先在派生类中重写它,才能进行实例化。具体代码是这样:
#include <iostream>
#include <string>

// Single-translation-unit build: the implementation file is included
// directly, so compile only this file and do not also compile main.cpp.
#include "main.cpp"

// Concrete client. HttpsAndHttpRequest declares handleProtocol() as pure
// virtual, so it has to be overridden before an object can be created.
class HttpsAndHttpRequests : public HttpsAndHttpRequest
{
public:
    // Override of the pure virtual hook.
    void handleProtocol() override
    {
        // Custom processing can be done here.
    }
};

int main()
{
    HttpsAndHttpRequests request;
    const std::string response = request.ConnectWebsite("https://sober-up.cn/");
    std::cout << response << std::endl;
    return 0;
}
总结
问:这个代码还有很多可以优化的地方,为啥不优化一下?
答:作者能力有限,不会优化(懒不想优化,目前够用)
以上就是HTTPS请求的C++实现(Linux)
有不懂的地方在评论区提出
最新评论