前言

我突发奇想,想用C++写一个爬虫(真该死,为啥我的想法这么丰富呢)。于是在收集大量资料后,成功写出了支持HTTP和HTTPS协议请求的代码,并且支持解析响应内容。
其中HTTPS请求是用OpenSSL和Socket实现的。

支持库

      <cstring>
      <string>
      <sys/socket.h>
      <netinet/in.h>
      <netdb.h>
      <vector>
      <unistd.h>
      <fcntl.h>
      <chrono>
      <sys/time.h>
      <openssl/ssl.h>
      <openssl/err.h>
(知道你们不想一个一个抄写,这里提供复制)

#include <fcntl.h>
#include <netdb.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <unistd.h>

#include <cerrno>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

#include <openssl/err.h>
#include <openssl/ssl.h>

代码编写

这里采用面向对象的方式编写,提供了操作接口

main.h

/**
 * Small HTTP/HTTPS GET client built on POSIX sockets and OpenSSL.
 *
 * The class is abstract: a subclass has to implement handleProtocol() before
 * it can be instantiated. Each instance owns one SSL_CTX for its whole
 * lifetime and stores the socket descriptor / SSL session of the request in
 * its members. No synchronization is performed on these members.
 *
 * NOTE(review): the SSL_CTX is created without any peer-verification setup
 * (no SSL_CTX_set_verify() / CA store loading in this class), so server
 * certificates are not validated. Confirm whether that is acceptable before
 * using this for anything security sensitive.
 */
class HttpsAndHttpRequest
{
private:
    sockaddr_in serverAddr{}; // address of the server used by the current request
    int socketFD = 0;         // socket descriptor; the request methods treat 0 as "no socket"
    int timeOutVal;           // timeout in seconds used for the select() waits

    // Components of a URL as produced by urlAnalysis().
    struct UrlStructure
    {
        std::string agreement;  // scheme, i.e. the text in front of "://"
        std::string host;       // host name without the port
        hostent *ip = nullptr;  // gethostbyname() result for host, null when resolution failed
        std::string port;       // text after ':' in the authority, empty when absent
        std::string path;       // path without its leading '/'
        std::string param;      // query string without the leading '?'
    };

    // Splits a URL into its components and resolves the host name.
    UrlStructure urlAnalysis(std::string url);

    // https
    SSL_CTX *ctx = nullptr; // created in the constructor, released in the destructor
    SSL *ssl = nullptr;     // TLS session of the HTTPS request, null when there is none

    std::string httpResponse; // raw response handed to analysisHttpProtocol()

    // One parsed line of a response, see analysisHttpProtocol().
    struct httpProtocol
    {
        std::string allContent; // the complete line as received
        std::string option;     // header name, or the marker "request-result" / "request-date"
        std::string content;    // header value, status code or response body
    };
    bool noBlock = true; // when true the sockets are switched to O_NONBLOCK

    // Parses httpResponse into the protocol vector.
    void analysisHttpProtocol();

    // Sends a GET request over plain HTTP and returns the raw response ("" on failure).
    std::string httpRequest(std::string url);

    // Sends a GET request over TLS and returns the raw response ("" on failure).
    std::string httpsRequest(std::string url);

public:
    // NOTE(review): the visible code only ever resets this flag to false in
    // analysisHttpProtocol(); its intended meaning should be confirmed.
    bool useHandleProtocol = false;

    /**
     * Initializes OpenSSL and creates the client context.
     *
     * @param timeoutSeconds timeout in seconds for the select() waits
     * @param noBlockSet     true to put the sockets into non-blocking mode
     */
    explicit HttpsAndHttpRequest(int timeoutSeconds = 5, bool noBlockSet = true)
        : timeOutVal(timeoutSeconds), noBlock(noBlockSet)
    {
        // OpenSSL initialization
        SSL_library_init();
        OpenSSL_add_all_algorithms();
        SSL_load_error_strings();

        // May be null when OpenSSL cannot allocate a context.
        ctx = SSL_CTX_new(SSLv23_client_method());
    }

    // The instance owns raw OpenSSL handles; copying it would free them twice.
    HttpsAndHttpRequest(const HttpsAndHttpRequest &) = delete;
    HttpsAndHttpRequest &operator=(const HttpsAndHttpRequest &) = delete;

    /**
     * Requests the given URL and returns the raw response.
     *
     * @param url      URL to fetch
     * @param analysis when true the response is additionally parsed into `protocol`
     * @return the raw response text, empty when the request failed
     */
    std::string ConnectWebsite(const std::string &url, bool analysis = false);

    // Replaces the timeout (in seconds) used by subsequent requests.
    void setTimeoutSeconds(int time);

    // Virtual because the class is meant to be derived from (handleProtocol()
    // is pure virtual) and may be destroyed through a base pointer.
    virtual ~HttpsAndHttpRequest()
    {
        if (ssl)
        {
            SSL_shutdown(ssl);
            SSL_free(ssl);
            ssl = nullptr;
        }
        if (ctx)
        {
            SSL_CTX_free(ctx);
            ctx = nullptr;
        }
    }

protected:
    std::vector<httpProtocol> protocol; // parsed response lines, filled by analysisHttpProtocol()

    // Hook for subclasses to process `protocol`. It is not invoked by any
    // code visible in this file.
    virtual void handleProtocol() = 0;
};

main.cpp

#include "main.h"

/**
 * Splits a URL of the form [scheme://]host[:port][/path][?query][#fragment]
 * into its components and resolves the host name.
 *
 * - agreement: text in front of "://", empty when the URL has no scheme
 * - host/port: authority split at the first ':'
 * - path:      text after the first '/' of the remainder, WITHOUT that slash
 * - param:     text after the first '?', without the question mark
 * - ip:        gethostbyname() result for host, null when resolution fails
 *
 * The fragment is dropped because it is never sent to a server.
 *
 * NOTE: gethostbyname() returns a pointer to static storage, so `ip` is only
 * valid until the next resolver call and the function is not thread-safe.
 *
 * @param url URL to split (taken by value so the fragment can be cut off)
 * @return the parsed components
 */
HttpsAndHttpRequest::UrlStructure HttpsAndHttpRequest::urlAnalysis(std::string url)
{
    const std::size_t npos = std::string::npos;
    UrlStructure result;

    // Cut off "#fragment" first so it cannot end up in the path or the query.
    const std::size_t fragmentPos = url.find('#');
    if (fragmentPos != npos)
        url.erase(fragmentPos);

    // "://" only introduces a scheme when no '/' or '?' precedes it; otherwise
    // it belongs to the path/query (e.g. "host/?next=http://other").
    std::size_t cursor = 0;
    const std::size_t schemePos = url.find("://");
    if (schemePos != npos && url.find_first_of("/?") == schemePos + 1)
    {
        result.agreement = url.substr(0, schemePos);
        cursor = schemePos + 3;
    }

    // The authority ends at whichever of '/' or '?' comes first.
    const std::size_t authorityEnd = url.find_first_of("/?", cursor);
    const std::string authority =
        authorityEnd == npos ? url.substr(cursor) : url.substr(cursor, authorityEnd - cursor);

    if (authorityEnd != npos)
    {
        if (url[authorityEnd] == '?')
        {
            // Query directly after the host, there is no path.
            result.param = url.substr(authorityEnd + 1);
        }
        else
        {
            const std::size_t pathStart = authorityEnd + 1;
            const std::size_t queryPos = url.find('?', pathStart);
            if (queryPos == npos)
                result.path = url.substr(pathStart);
            else
            {
                result.path = url.substr(pathStart, queryPos - pathStart);
                result.param = url.substr(queryPos + 1);
            }
        }
    }

    const std::size_t portPos = authority.find(':');
    if (portPos == npos)
        result.host = authority;
    else
    {
        result.host = authority.substr(0, portPos);
        result.port = authority.substr(portPos + 1);
    }

    result.ip = gethostbyname(result.host.c_str());

    return result;
}

std::string HttpsAndHttpRequest::ConnectWebsite(const std::string &url, bool analysis)
{
    UrlStructure urlResource = urlAnalysis(url);
    bool NoAgreement = false;
    bool existPort = true;
    if (urlResource.agreement != "https" || urlResource.agreement != "http")
    {
        urlResource.agreement = "http";
        NoAgreement = true;
    }

    if (urlResource.port == "")
    {
        existPort = false;
        if (urlResource.agreement == "https")
            urlResource.port = "443";
        else
            urlResource.port = "80";
    }
    std::string urlContent = urlResource.agreement + "://" + urlResource.host + ":" + urlResource.port + "/" + urlResource.path + "?" + urlResource.param;

    std::string result = "";

    if (NoAgreement)
    {
        result = httpsRequest(urlContent);
        if (result.empty())
        {
            if (existPort)
                urlContent = "http://" + urlResource.host + ":80" + "/" + urlResource.path + "?" + urlResource.param;

            result = httpRequest(urlContent);
        }
    }
    else if (urlResource.agreement == "https")
        result = httpsRequest(urlContent);
    else
        result = httpRequest(urlContent);

    if (analysis)
    {
        httpResponse = result;
        analysisHttpProtocol();
    }

    return result;
}

std::string HttpsAndHttpRequest::httpRequest(std::string url)
{
    if (socketFD)
        close(socketFD);

    UrlStructure urlResource = urlAnalysis(url);

    if (urlResource.port.empty())
        urlResource.port = "80";

    if (!urlResource.ip)
        return "";

    socketFD = socket(AF_INET, SOCK_STREAM, 0);
    if (noBlock)
    {
        int flags = fcntl(socketFD, F_GETFL, 0);
        fcntl(socketFD, F_SETFL, flags | O_NONBLOCK);
    }
    // 设置连接地址
    serverAddr.sin_family = AF_INET;
    serverAddr.sin_port = htons(atoi(urlResource.port.c_str()));
    serverAddr.sin_addr = *(in_addr *)urlResource.ip->h_addr_list[0];

    // 构建http请求
    std::string request = "GET /" + urlResource.path + " HTTP/1.1\r\n";
    request += "Host: " + urlResource.host + "\r\n";
    request += "Connection: close\r\n";
    request += "\r\n";

    int nRet = connect(socketFD, (sockaddr *)&serverAddr, sizeof(serverAddr));

    timeval timeout;
    timeout.tv_sec = timeOutVal;

    fd_set wait;
    FD_ZERO(&wait);
    FD_SET(socketFD, &wait);

    nRet = select(socketFD + 1, NULL, &wait, NULL, &timeout);
    if (nRet <= 0)
        return "";

    // 发送HTTP请求
    if (write(socketFD, request.c_str(), strlen(request.c_str())) < 0)
        return "";

    std::string response = "";
    char buffer[1024];
    int len = 0;

    timeout.tv_sec = timeOutVal;
    FD_ZERO(&wait);
    FD_SET(socketFD, &wait);
    while (true)
    {
        int ready = select(socketFD + 1, &wait, NULL, NULL, &timeout);
        if (ready > 0)
        {
            if (FD_ISSET(socketFD, &wait))
            {
                len = read(socketFD, buffer, sizeof(buffer));
                if (len > 0)
                    response.append(buffer, len);
                else
                    break;
            }
        }
        else
            break;
    }
    close(socketFD);
    return response;
}

std::string HttpsAndHttpRequest::httpsRequest(std::string url)
{
    if (socketFD)
        close(socketFD);

    UrlStructure urlResource = urlAnalysis(url);
    if (urlResource.port.empty())
    {
        urlResource.port = "443";
    }

    if (!urlResource.ip)
    {
        return "";
    }

    socketFD = socket(AF_INET, SOCK_STREAM, 0);
    if (noBlock)
    {
        int flags = fcntl(socketFD, F_GETFL, 0);
        fcntl(socketFD, F_SETFL, flags | O_NONBLOCK);
    }

    // 设置连接地址
    serverAddr.sin_family = AF_INET;
    serverAddr.sin_port = htons(atoi(urlResource.port.c_str()));
    serverAddr.sin_addr = *(in_addr *)urlResource.ip->h_addr_list[0];

    // 构建http请求
    std::string request = "GET /" + urlResource.path + " HTTP/1.1\r\n";
    request += "Host: " + urlResource.host + "\r\n";
    request += "Connection: close\r\n";
    request += "\r\n";

    int nRet = connect(socketFD, (sockaddr *)&serverAddr, sizeof(serverAddr));

    timeval timeout;
    timeout.tv_sec = timeOutVal;

    fd_set wait;
    FD_ZERO(&wait);
    FD_SET(socketFD, &wait);

    nRet = select(socketFD + 1, NULL, &wait, NULL, &timeout);
    if (nRet <= 0)
        return "";

    SSL_CTX_set_timeout(ctx, timeOutVal);
    ssl = SSL_new(ctx);
    SSL_set_fd(ssl, socketFD);

    timeval startTime;
    gettimeofday(&startTime, NULL);
    timeval nowTime;
    int cutTime = 0;
    while ((nRet = SSL_connect(ssl)) != 1)
    {
        int sslError = SSL_get_error(ssl, nRet);
        if ((sslError != SSL_ERROR_WANT_READ && sslError != SSL_ERROR_WANT_WRITE) || cutTime > timeOutVal)
            return "";
        gettimeofday(&nowTime, NULL);
        cutTime = nowTime.tv_sec - startTime.tv_sec;
        usleep(100000);
    }

    if (SSL_write(ssl, request.c_str(), request.size()) <= 0)
    {
        return "";
    }

    std::string response = "";
    char buffer[1024];
    int len = 0;

    FD_ZERO(&wait);
    FD_SET(socketFD, &wait);

    while (true)
    {
        int ready = select(socketFD + 1, &wait, NULL, NULL, &timeout);
        if (ready > 0)
        {
            if (FD_ISSET(socketFD, &wait))
            {
                len = SSL_read(ssl, buffer, sizeof(buffer));
                if (len > 0)
                    response.append(buffer, len);
                else
                    break;
            }
        }
        else
            break;
    }
    return response;
}

/**
 * Parses httpResponse line by line (lines end with "\r\n") into `protocol`.
 *
 * - First line (status line): option is "request-result", content is the
 *   token between the first and second space (the status code).
 * - Header lines: option is the text before the first ':', content is
 *   everything after it (leading whitespace is kept). A line without ':' is
 *   stored as both option and content.
 * - The first empty line ends the headers; everything after it is stored as
 *   one entry {"request-date : ...", "request-date", <body>}.
 *
 * If the response ends before an empty line is seen, no body entry is added.
 * Does nothing when httpResponse is empty. Resets useHandleProtocol to false
 * once parsing has run.
 */
void HttpsAndHttpRequest::analysisHttpProtocol()
{
    if (httpResponse.empty())
        return;

    protocol.clear();
    protocol.shrink_to_fit();

    const std::size_t npos = std::string::npos;
    std::size_t lineStart = 0;
    bool isStatusLine = true;

    while (lineStart <= httpResponse.size())
    {
        const std::size_t lineEnd = httpResponse.find("\r\n", lineStart);
        const bool isLastLine = lineEnd == npos;
        const std::string line = isLastLine ? httpResponse.substr(lineStart)
                                            : httpResponse.substr(lineStart, lineEnd - lineStart);

        if (line.empty())
        {
            // An empty line followed by CRLF separates headers from the body.
            // An empty remainder without CRLF just means the data ran out.
            if (!isLastLine)
                protocol.push_back({"request-date : ...", "request-date", httpResponse.substr(lineEnd + 2)});
            break;
        }

        std::string option;
        std::string content;
        if (isStatusLine)
        {
            option = "request-result";
            const std::size_t firstSpace = line.find(' ');
            const std::size_t codeStart = firstSpace == npos ? 0 : firstSpace + 1;
            const std::size_t codeEnd = line.find(' ', codeStart);
            content = codeEnd == npos ? line.substr(codeStart)
                                      : line.substr(codeStart, codeEnd - codeStart);
        }
        else
        {
            const std::size_t colon = line.find(':');
            if (colon == npos)
            {
                option = line;
                content = line;
            }
            else
            {
                option = line.substr(0, colon);
                content = line.substr(colon + 1);
            }
        }
        protocol.push_back({line, option, content});

        if (isLastLine)
            break;
        lineStart = lineEnd + 2;
        isStatusLine = false;
    }
    useHandleProtocol = false;
}

void HttpsAndHttpRequest::setTimeoutSeconds(int time)
{
    timeOutVal = time;
}

示例

这里涉及到虚函数,没接触过面向对象的游戏玩家(地球Online)可能不懂。
当你想要实例化(初始化)一个类的时候,如果它的基类里含有纯虚函数(以 `= 0` 结尾的virtual函数),就必须先在派生类中重写该函数,才能进行实例化。具体代码是这样:

#include "main.cpp"
class HttpsAndHttpRequests : public HttpsAndHttpRequest
{
public:
    //重写那个虚函数
    void handleProtocol()
    {
        //可以进行处理
    }
};

int main(){
    HttpsAndHttpRequests request
    std::string response = request.ConnectWebsite("https://sober-up.cn/");
    std::cout << response << std::endl;
}

总结

问:这个代码还有很多可以优化的地方,为啥不优化一下?
答:作者能力有限,不会优化(懒不想优化,目前够用)

以上就是HTTPS请求的C++实现(Linux)
有不懂的地方在评论区提出

具体函数介绍在下一页