LeetCode 187. Repeated DNA Sequences(使用 map / unordered_map)

All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.

Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.

Example:

Input: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"

Output: ["AAAAACCCCC", "CCCCCAAAAA"]

题目链接:https://leetcode.com/problems/repeated-dna-sequences/

/**
 * LeetCode 187 - Repeated DNA Sequences, ordered-map version.
 *
 * Counts every 10-character window of the input in a std::map keyed by the
 * window text, and reports a window at the moment its count reaches two.
 */
class Solution {
public:
    /**
     * Finds all 10-character substrings that occur more than once in `s`.
     *
     * @param s  Input sequence.
     * @return   Every repeated window exactly once, in the order in which the
     *           second occurrence of each window is met. Empty when `s` has
     *           10 characters or fewer.
     */
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        const int N = 10;  // window length required by the problem
        const int len = static_cast<int>(s.size());

        std::vector<std::string> ans;
        if (len <= N) return ans;  // at most one window: nothing can repeat

        std::map<std::string, int> m;
        for (int i = 0; i + N <= len; ++i) {
            // Build the window once and reuse it for the lookup and the result.
            const std::string window = s.substr(i, N);

            // Report only on the transition 1 -> 2 so that a window seen three
            // or more times is still listed a single time.
            if (++m[window] == 2) {
                ans.push_back(window);
            }
        }
        return ans;
    }
};

后来看了一份别人的代码,才发现全用 map<string,int> 比较慢。下面的写法把每个长度为 10 的子串编码成一个整数,再用 unordered_map 计数:

/**
 * LeetCode 187 - Repeated DNA Sequences, rolling decimal hash version.
 *
 * Each nucleotide is mapped to a decimal digit (A=1, C=2, G=3, T=4) and a
 * 10-character window is represented by the 10-digit number formed by its
 * digits. The number is updated in O(1) when the window slides by one
 * character, and occurrences are counted in a std::unordered_map.
 */
class Solution {
public:
    /**
     * Finds all 10-character substrings that occur more than once in `s`.
     *
     * @param s  Input sequence; expected to consist of 'A', 'C', 'G', 'T'.
     * @return   Every repeated window exactly once, in the order in which the
     *           second occurrence of each window is met. Empty when `s` has
     *           10 characters or fewer.
     */
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        const int N = 10;  // window length required by the problem
        const int len = static_cast<int>(s.size());

        std::vector<std::string> ans;
        if (len <= N) return ans;  // at most one window: nothing can repeat

        // 10^(N-1): `code % high` drops the most significant digit, i.e. the
        // character that leaves the window.
        long long high = 1;
        for (int i = 1; i < N; ++i) high *= 10;

        // The code can reach 4444444444, which does not fit in 32 bits, so a
        // type that is guaranteed to be at least 64 bits wide is required
        // (plain `long` is only 32 bits on some platforms).
        std::unordered_map<long long, int> m;
        long long sum = 0;
        for (int i = 0; i < len; ++i) {
            // While the window is still filling up, `sum` has fewer than N
            // digits, so the modulo leaves it unchanged.
            sum = (sum % high) * 10 + digitOf(s[i]);
            if (i < N - 1) continue;  // first full window ends at index N-1

            // Report only on the transition 1 -> 2 so that each repeated
            // window is listed a single time.
            if (++m[sum] == 2) {
                ans.push_back(s.substr(i - N + 1, N));
            }
        }
        return ans;
    }

private:
    /**
     * Maps a nucleotide letter to its decimal digit.
     *
     * Any other character maps to 0 instead of indexing outside a lookup
     * table; such input is outside the problem's contract and windows
     * containing it may collide with each other.
     */
    static int digitOf(char c) {
        switch (c) {
            case 'A': return 1;
            case 'C': return 2;
            case 'G': return 3;
            case 'T': return 4;
            default:  return 0;
        }
    }
};

然后我发现了一个有趣的东西

使用unordered_map<long,int> m;程序一般只要44ms

如果使用map就不一样了,慢很多

所以我要弄清楚这两者的差别在哪里:map 与 unordered_map 的区别太大了。简单来说,std::map 是有序的平衡二叉树(通常是红黑树),查找和插入都是 O(log n);std::unordered_map 是哈希表,平均只要 O(1)。

同时再贴上一份只要8ms的代码,我还没看懂,我会继续看,向优秀的代码学习

/**
 * LeetCode 187 - Repeated DNA Sequences, 2-bit code + bitset version.
 *
 * Each nucleotide is packed into 2 bits, so a 10-character window becomes a
 * 20-bit integer. Two bitsets indexed by that integer record "seen at least
 * once" and "already reported", which replaces a hash map entirely.
 */
class Solution {
public:
    /**
     * Finds all 10-character substrings that occur more than once in `s`.
     *
     * @param s  Input sequence; expected to consist of 'A', 'C', 'G', 'T'.
     * @return   Every repeated window exactly once, in the order in which the
     *           second occurrence of each window is met.
     */
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        return find_str(s);
    }

private:
    static constexpr int LEN = 10;                 // window length
    static constexpr int CODE_BITS = 2 * LEN;      // 2 bits per nucleotide

    /**
     * Maps a nucleotide letter to its 2-bit code.
     * Any other character trips the assertion; when assertions are compiled
     * out it is treated like 'A' (code 0).
     */
    static int c2i(char c) {
        switch (c) {
            case 'A': return 0;
            case 'C': return 1;
            case 'G': return 2;
            case 'T': return 3;
            default: assert(0);
        }
        return 0;
    }

    /**
     * Slides a LEN-character window over `s`, keeping its 20-bit code up to
     * date, and collects every window whose code is met a second time.
     */
    std::vector<std::string> find_str(const std::string& s) {
        // seenOnce[v]  : code v has occurred at least once.
        // reported[v]  : code v has occurred at least twice and is already in
        //                the result, so further occurrences are ignored.
        std::bitset<(1 << CODE_BITS)> seenOnce, reported;
        std::vector<std::string> r;

        const std::uint32_t mask = (1u << CODE_BITS) - 1;  // keep the low 20 bits
        std::uint32_t v = 0;  // code of the last (up to) LEN characters

        const int n = static_cast<int>(s.length());
        for (int i = 0; i < n; i++) {
            // Shift the new nucleotide in; the mask drops the one that left.
            v = ((v << 2) | static_cast<std::uint32_t>(c2i(s[i]))) & mask;

            if (i < LEN - 1)
                continue;  // window not full yet

            if (reported[v])
                continue;  // third or later occurrence

            if (seenOnce[v]) {
                // Second occurrence: report it once.
                r.push_back(s.substr(i - LEN + 1, LEN));
                reported.set(v);
            } else {
                seenOnce.set(v);
            }
        }

        return r;
    }
};

猜你喜欢

转载自blog.csdn.net/salmonwilliam/article/details/88254452