Common algorithm techniques - string matching algorithm

A brief introduction

String matching algorithms find occurrences of one string (the pattern) within another string (the text).

Algorithm classification

Brute force matching algorithm (Brute-Force)

The brute force matching algorithm, also known as the naive algorithm, is a simple and direct string matching algorithm. It starts at the beginning of the text and compares the pattern with the text character by character; when it encounters a non-matching character, it slides the pattern one position to the right and starts comparing again.

#include <iostream>
#include <string>

// Naive substring search.
//
// Tries every alignment of `pattern` against `text` from left to right and
// compares the two character by character.
//
// Returns the index of the first alignment at which every pattern character
// matches, or -1 if there is no such alignment. An empty pattern matches at
// index 0; a pattern longer than the text never matches.
int bruteForceSearch(const std::string& text, const std::string& pattern) {
    const int textLen = text.length();
    const int patternLen = pattern.length();
    const int lastStart = textLen - patternLen;  // negative when the pattern is longer

    int start = 0;
    while (start <= lastStart) {
        // Count how many leading pattern characters agree at this alignment.
        int matched = 0;
        while (matched < patternLen && text[start + matched] == pattern[matched]) {
            ++matched;
        }

        if (matched == patternLen) {
            return start;  // match found: report where it starts
        }
        ++start;
    }

    return -1;  // no alignment matched
}

// Demo: look for "World" inside "Hello, World!" with the brute-force search
// and print the outcome.
int main() {
    const std::string text = "Hello, World!";
    const std::string pattern = "World";

    const int position = bruteForceSearch(text, pattern);

    // Handle the "no match" sentinel first, then report the position.
    if (position == -1) {
        std::cout << "Pattern not found" << std::endl;
        return 0;
    }

    std::cout << "Pattern found at position " << position << std::endl;
    return 0;
}

KMP algorithm (Knuth-Morris-Pratt)

The KMP algorithm is an efficient string matching algorithm. It preprocesses the pattern to build a partial match table, and uses this table to skip ahead during matching, thereby reducing the number of comparisons.

#include <iostream>
#include <string>
#include <vector>

// Builds the KMP partial match (failure) table for `pattern`.
//
// After the call, table[i] holds the length of the longest proper prefix of
// pattern[0..i] that is also a suffix of pattern[0..i].
//
// `table` is grown to pattern.length() entries if it is shorter; entries past
// the pattern length are left untouched. An empty pattern is a no-op (the
// table is not indexed at all).
void buildPartialMatchTable(const std::string& pattern, std::vector<int>& table) {
    const std::size_t m = pattern.length();
    if (table.size() < m) {
        table.resize(m, 0);
    }
    if (m == 0) {
        return;  // nothing to fill; writing table[0] would be out of bounds
    }

    table[0] = 0;
    int j = 0;  // length of the border currently being extended
    for (std::size_t i = 1; i < m; ++i) {
        // On mismatch fall back to the next shorter border until one can be
        // extended by pattern[i], or no border is left.
        while (j > 0 && pattern[i] != pattern[j]) {
            j = table[j - 1];
        }
        if (pattern[i] == pattern[j]) {
            ++j;
        }
        table[i] = j;
    }
}

// Knuth-Morris-Pratt substring search.
//
// Returns the index of the first occurrence of `pattern` in `text`, or -1 if
// the pattern does not occur. An empty pattern matches at index 0 (the same
// result bruteForceSearch gives); a pattern longer than the text never matches.
int kmpSearch(const std::string& text, const std::string& pattern) {
    const int n = static_cast<int>(text.length());
    const int m = static_cast<int>(pattern.length());

    if (m == 0) {
        return 0;  // avoid building/indexing a zero-length table
    }
    if (m > n) {
        return -1;
    }

    std::vector<int> table(m, 0);
    buildPartialMatchTable(pattern, table);

    int j = 0;  // number of pattern characters currently matched
    for (int i = 0; i < n; ++i) {
        // On mismatch reuse the longest border of the matched prefix instead
        // of re-reading text characters.
        while (j > 0 && pattern[j] != text[i]) {
            j = table[j - 1];
        }
        if (pattern[j] == text[i]) {
            ++j;
        }
        if (j == m) {
            return i + 1 - m;  // match found: report where it starts
        }
    }

    return -1;  // no match
}

// Demo: look for "World" inside "Hello, World!" with the KMP search and print
// the outcome.
int main() {
    const std::string text = "Hello, World!";
    const std::string pattern = "World";

    const int position = kmpSearch(text, pattern);
    const bool found = (position != -1);

    if (!found) {
        std::cout << "Pattern not found" << std::endl;
    } else {
        std::cout << "Pattern found at position " << position << std::endl;
    }

    return 0;
}

Boyer-Moore algorithm

The Boyer-Moore algorithm is an efficient string matching algorithm. It preprocesses the pattern and uses the Bad Character Rule and the Good Suffix Rule to skip ahead during matching, thereby reducing the number of comparisons.

#include <iostream>
#include <string>
#include <vector>
#include <algorithm>

// Number of entries in the bad-character table used by boyerMooreSearch.
// 256 is one entry per 8-bit character value (more than 7-bit ASCII, despite the name).
const int ASCII_SIZE = 256;

// Builds the Boyer-Moore bad-character table for `pattern`.
//
// Every entry of `table` is first reset to -1; then, for each character of the
// pattern, the entry indexed by that character's unsigned byte value is set to
// the index of its last (rightmost) occurrence in the pattern.
//
// `table` must have at least 256 entries so that every byte value is a valid
// index.
void buildBadCharacterTable(const std::string& pattern, std::vector<int>& table) {
    const int m = static_cast<int>(pattern.length());
    std::fill(table.begin(), table.end(), -1);

    for (int i = 0; i < m; ++i) {
        // Plain char may be signed: bytes >= 0x80 would become negative
        // indices, so go through unsigned char first.
        const unsigned char c = static_cast<unsigned char>(pattern[i]);
        table[c] = i;
    }
}

// Builds the Boyer-Moore (strong) good-suffix shift table for `pattern`.
//
// After the call, table[j] is the distance the pattern may be shifted to the
// right when a mismatch happens at pattern index j after pattern[j+1..m-1]
// has already matched the text. Every entry is in the range [1, m], so using
// it as a shift always makes progress.
//
// `table` is grown to pattern.length() entries if it is shorter; entries past
// the pattern length are left untouched. An empty pattern leaves the table
// unchanged.
//
// The implementation is self-contained: it uses the classic two-pass border
// computation and needs no helper functions.
void buildGoodSuffixTable(const std::string& pattern, std::vector<int>& table) {
    const int m = static_cast<int>(pattern.length());
    if (table.size() < pattern.length()) {
        table.resize(pattern.length(), 0);
    }

    // shift[k]  : shift to apply when the suffix starting at k has matched
    //             (k == m means nothing matched yet); 0 means "not set yet".
    // border[k] : start of the widest border of the suffix starting at k.
    std::vector<int> shift(m + 1, 0);
    std::vector<int> border(m + 1, 0);

    // Pass 1: the matched suffix re-occurs inside the pattern, preceded by a
    // different character than the one that just mismatched.
    int i = m;
    int j = m + 1;
    border[i] = j;
    while (i > 0) {
        while (j <= m && pattern[i - 1] != pattern[j - 1]) {
            if (shift[j] == 0) {
                shift[j] = j - i;
            }
            j = border[j];
        }
        --i;
        --j;
        border[i] = j;
    }

    // Pass 2: only a suffix of the matched part lines up with a prefix of the
    // pattern; fill every entry pass 1 left unset.
    j = border[0];
    for (i = 0; i <= m; ++i) {
        if (shift[i] == 0) {
            shift[i] = j;
        }
        if (i == j) {
            j = border[j];
        }
    }

    // A mismatch at index k means the suffix starting at k + 1 matched.
    for (int k = 0; k < m; ++k) {
        table[k] = shift[k + 1];
    }
}

// Boyer-Moore substring search.
//
// Compares the pattern against the text from right to left at each alignment
// and, on a mismatch, advances by the larger of the bad-character shift and
// the good-suffix shift.
//
// Returns the index of the first occurrence of `pattern` in `text`, or -1 if
// the pattern does not occur. An empty pattern matches at index 0.
int boyerMooreSearch(const std::string& text, const std::string& pattern) {
    const int n = static_cast<int>(text.length());
    const int m = static_cast<int>(pattern.length());
    std::vector<int> badCharacter(ASCII_SIZE, -1);
    std::vector<int> goodSuffix(m, 0);

    buildBadCharacterTable(pattern, badCharacter);
    buildGoodSuffixTable(pattern, goodSuffix);

    int i = 0;  // current alignment of pattern[0] within the text
    while (i <= n - m) {
        int j = m - 1;
        while (j >= 0 && pattern[j] == text[i + j]) {
            --j;
        }

        if (j < 0) {
            return i;  // match found: report where it starts
        }

        // Index by unsigned byte value: plain char may be negative.
        const unsigned char bad = static_cast<unsigned char>(text[i + j]);
        const int shift = std::max(j - badCharacter[bad], goodSuffix[j]);

        // The bad-character term is negative when the character's last
        // occurrence lies to the right of j; never move by less than one
        // position, otherwise the search could stall or run backwards.
        i += std::max(1, shift);
    }

    return -1;  // no match
}

// Demo: look for "World" inside "Hello, World!" with the Boyer-Moore search
// and print the outcome.
int main() {
    const std::string text = "Hello, World!";
    const std::string pattern = "World";

    const int position = boyerMooreSearch(text, pattern);

    // -1 is the "no match" sentinel; anything else is the match position.
    if (position == -1) {
        std::cout << "Pattern not found" << std::endl;
        return 0;
    }

    std::cout << "Pattern found at position " << position << std::endl;
    return 0;
}

Article summary

Depending on the actual application scenario and data scale, choosing an appropriate string matching algorithm can improve matching efficiency. I hope the above examples are helpful! If you have any additional questions, please feel free to ask.

Guess you like

Origin blog.csdn.net/qq_45902301/article/details/131666413