C语言解析HTML网页中的图片URL

版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。
本文链接: https://blog.csdn.net/weixin_43138570/article/details/102606576

实例:

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <regex.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * POSIX basic regular expression (it is compiled with cflags == 0) that
 * matches an <img ...> tag with a double-quoted src attribute.
 * Sub-expression 1 captures the attribute value with surrounding blanks
 * stripped. Despite the "HREF" in its name, the pattern targets
 * <img src="...">, not <a href="...">.
 *
 * NOTE(review): "\s" is not part of POSIX BRE; it appears to rely on a
 * regex library extension (e.g. glibc) -- confirm on other platforms.
 *
 * Both the characters and the pointer are const so the pattern cannot be
 * re-pointed or modified at run time.
 */
static const char *const HREF_PATTERN = "<img [^>]*src=\"\\s*\\([^ >\"]*\\)\\s*\"";

/* Sizes used by the helpers below. */
enum {
    READ_CHUNK = 64 * 1024, /* initial size of the file buffer, in bytes */
    NMATCH = 2              /* whole match + the captured URL */
};

/*
 * Read the entire file at `path` into a heap buffer and NUL-terminate it.
 *
 * The buffer doubles in size whenever it fills up, so the file size is not
 * limited by a fixed array. Returns the buffer (the caller must free() it)
 * or NULL after printing a diagnostic to stderr. The descriptor is closed
 * on every path.
 */
static char *read_file(const char *path)
{
    size_t cap = READ_CHUNK + 1;
    size_t used = 0;
    char *buf = NULL;
    int fd = open(path, O_RDONLY);

    if (fd < 0) {
        fprintf(stderr, "file:%s open error\n", path);
        return NULL;
    }

    buf = malloc(cap);
    if (buf == NULL) {
        fprintf(stderr, "out of memory\n");
        close(fd);
        return NULL;
    }

    for (;;) {
        ssize_t n;

        /* Keep at least one spare byte for the terminating NUL. */
        if (cap - used <= 1) {
            char *grown = (cap > (size_t)-1 / 2) ? NULL : realloc(buf, cap * 2);

            if (grown == NULL) {
                fprintf(stderr, "out of memory\n");
                goto fail;
            }
            buf = grown;
            cap *= 2;
        }

        n = read(fd, buf + used, cap - used - 1);
        if (n < 0) {
            fprintf(stderr, "file read error\n");
            goto fail;
        }
        if (n == 0) {
            break; /* end of file */
        }
        used += (size_t)n;
    }

    close(fd);
    buf[used] = '\0';
    return buf;

fail:
    free(buf);
    close(fd);
    return NULL;
}

/*
 * Print sub-expression 1 of every match of `re` in the NUL-terminated
 * string `html`, one per line, on stdout.
 *
 * Only a return value of 0 from regexec() is treated as a match; any other
 * result (REG_NOMATCH or an error) ends the scan, so the match offsets are
 * never read when they are not valid.
 */
static void print_img_urls(const regex_t *re, const char *html)
{
    regmatch_t m[NMATCH];
    const char *cur = html;

    while (regexec(re, cur, NMATCH, m, 0) == 0) {
        size_t len = (size_t)(m[1].rm_eo - m[1].rm_so);

        /* Write straight from the input; no temporary copy is needed. */
        fwrite(cur + m[1].rm_so, 1, len, stdout);
        putchar('\n');

        /* An empty overall match would never advance; stop instead. */
        if (m[0].rm_eo <= 0) {
            break;
        }
        /* Resume scanning right after the whole matched tag prefix. */
        cur += m[0].rm_eo;
    }
}

/*
 * Usage: prog <html-file>
 *
 * Loads the file named by argv[1] and prints the src URL of every
 * <img ... src="..."> occurrence that HREF_PATTERN matches.
 * Returns 0 on success and -1 on a usage, I/O, memory or regex error.
 */
int main(int argc, char** argv){
    regex_t re;
    char *html;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <html-file>\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "prog");
        return -1;
    }

    html = read_file(argv[1]);
    if (html == NULL) {
        return -1;
    }
    printf("\nfile read over! begin URL analyse now...\n");

    /* Bail out on a compile failure: `re` is unusable in that case. */
    if (regcomp(&re, HREF_PATTERN, 0) != 0) {
        fprintf(stderr, "compile regex error\n");
        free(html);
        return -1;
    }

    print_img_urls(&re, html);

    regfree(&re);
    free(html);
    return 0;
}

运行结果:

gcc test.c -o test
./test www.zol.com.cn_webcenter_map.html 

file read over! begin URL analyse now...
https://dg-fd.zol-img.com.cn/t_s2000x2000/g5/M00/08/00/ChMkJ1YYZveITXKkAAADVZen7iIAADfxQO_-UMAAANt785.png
https://dg-fd.zol-img.com.cn/t_s2000x2000/g4/M00/06/07/Cg-4zFUCTDGIbtftAAAB8xYSy2YAAWoVALpfBUAAAIL673.png
https://dg-fd.zol-img.com.cn/t_s2000x2000/g4/M08/06/08/Cg-4zFUCVDuIXpdBAAABlOjlfjUAAWoXAP__gQAAAH8892.png
https://dg-fd.zol-img.com.cn/t_s2000x2000/g5/M00/08/0C/ChMkJ1ez0HiILQTIAAAPmt4wdOoAAUgUAN2Y3IAAA-y501.png

猜你喜欢

转载自blog.csdn.net/weixin_43138570/article/details/102606576