All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
Example:
Input: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"
Output: ["AAAAACCCCC", "CCCCCAAAAA"]
题目链接:https://leetcode.com/problems/repeated-dna-sequences/
// Approach 1: count every 10-letter window in an ordered map keyed by the
// window text itself.
class Solution {
public:
    /// Finds every 10-letter substring of `s` that occurs more than once.
    ///
    /// @param s  The DNA string to scan.
    /// @return   Each repeated 10-letter sequence exactly once, ordered by the
    ///           position of its second occurrence. Empty when `s` has 10 or
    ///           fewer characters (a single window cannot repeat).
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        const std::string::size_type kWindow = 10;

        std::vector<std::string> ans;
        if (s.size() <= kWindow) return ans;

        std::map<std::string, int> seen;
        for (std::string::size_type start = 0; start + kWindow <= s.size(); ++start) {
            // Build the window once and reuse the map slot: emplace returns the
            // existing entry when the window was already seen, so there is a
            // single substring allocation and a single lookup per position.
            auto slot = seen.emplace(s.substr(start, kWindow), 0).first;

            // Report on the transition 1 -> 2 only, so a sequence that shows up
            // three or more times is still listed once.
            if (++slot->second == 2) {
                ans.push_back(slot->first);
            }
        }
        return ans;
    }
};
后来看了一份别人的代码,才发现全用map<string,int>比较慢;下面这个版本改用unordered_map,并把每个长度为10的窗口编码成一个整数作为键:
// Approach 2: encode each 10-letter window as a 10-digit decimal number
// (A=1, C=2, G=3, T=4) and count the numbers in a hash map, updating the
// number incrementally as the window slides.
class Solution {
public:
    /// Finds every 10-letter substring of `s` that occurs more than once.
    ///
    /// @param s  The DNA string to scan; expected to contain only 'A', 'C',
    ///           'G' and 'T'. Any other character is encoded as digit 0, so
    ///           windows that differ only in such characters are conflated.
    /// @return   Each repeated 10-letter sequence exactly once, ordered by the
    ///           position of its second occurrence. Empty when `s` has 10 or
    ///           fewer characters.
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        const std::string::size_type kWindow = 10;
        // 10^(kWindow - 1): `key % kTopPlace` strips the oldest (most
        // significant) digit before the newest one is appended.
        const long long kTopPlace = 1000000000LL;

        std::vector<std::string> ans;
        if (s.size() <= kWindow) return ans;

        // The largest key is 4444444444, which does not fit in 32 bits. `long`
        // is only 32 bits wide on some platforms, so the key is `long long`.
        std::unordered_map<long long, int> seen;
        long long key = 0;
        for (std::string::size_type i = 0; i < s.size(); ++i) {
            // While fewer than kWindow digits have been read the modulo is a
            // no-op, so the same update also builds the very first window.
            key = (key % kTopPlace) * 10 + digitOf(s[i]);
            if (i + 1 < kWindow) continue;  // window not full yet

            // Report on the transition 1 -> 2 only, so each sequence is
            // listed once no matter how often it repeats.
            if (++seen[key] == 2) {
                ans.emplace_back(s.substr(i + 1 - kWindow, kWindow));
            }
        }
        return ans;
    }

private:
    /// Maps a nucleotide letter to its decimal digit; unknown characters map
    /// to 0 instead of indexing outside a lookup table.
    static int digitOf(char c) {
        switch (c) {
            case 'A': return 1;
            case 'C': return 2;
            case 'G': return 3;
            case 'T': return 4;
            default:  return 0;
        }
    }
};
然后我发现了一个有趣的现象:
使用unordered_map(以整数编码作为键)时,程序一般只要44ms;
如果使用map就不一样了,慢很多。
所以我要弄清楚这两者的差别在哪里:map基于红黑树,每次查找/插入是O(log n),而且以string为键时每次比较都要逐字符比较;unordered_map基于哈希表,平均O(1),以整数为键时哈希和比较都很便宜。map与unordered_map区别太大了。
同时再贴上一份只要8ms的代码,我还没看懂,我会继续看,向优秀的代码学习:
// Approach 3: pack each 10-letter window into a 20-bit integer (2 bits per
// nucleotide) and track "seen" / "already reported" in two bitsets indexed by
// that integer, so no hashing or string keys are needed.
class Solution {
public:
    /// Finds every 10-letter substring of `s` that occurs more than once.
    ///
    /// @param s  The DNA string to scan; must contain only 'A', 'C', 'G' and
    ///           'T' (any other character trips the assert in c2i).
    /// @return   Each repeated 10-letter sequence exactly once, ordered by the
    ///           position of its second occurrence.
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        return find_str(s);
    }

private:
    // Window length in characters, and the number of bits one window occupies.
    static constexpr unsigned LEN = 10;
    static constexpr unsigned HASH_BITS = 2 * LEN;

    /// Maps a nucleotide letter to its 2-bit code.
    static int c2i(char c) {
        switch (c) {
            case 'A': return 0;
            case 'C': return 1;
            case 'G': return 2;
            case 'T': return 3;
            default: assert(0);
        }
        return 0;  // reached only when asserts are compiled out
    }

    /// Slides a 10-character window over `s`, keeping its 20-bit code in `v`.
    std::vector<std::string> find_str(const std::string& s) {
        // seen_once[v]: the window with code v has occurred at least once.
        // reported[v]:  that window has already been added to the result.
        // One bit per possible code: 2^20 bits (128 KiB) per bitset.
        std::bitset<(1u << HASH_BITS)> seen_once, reported;
        std::vector<std::string> r;

        const uint32_t mask = (1u << HASH_BITS) - 1;  // keeps the low 20 bits
        uint32_t v = 0;  // code of the last (up to) 10 characters
        for (std::string::size_type i = 0; i < s.length(); ++i) {
            // Shift in the new character; the mask drops the character that
            // just left the window.
            v = ((v << 2) | static_cast<uint32_t>(c2i(s[i]))) & mask;
            if (i + 1 < LEN)
                continue;  // window not full yet
            if (reported[v])
                continue;  // third or later occurrence: already in the result
            if (seen_once[v]) {
                // Second occurrence: report it exactly once.
                r.push_back(s.substr(i + 1 - LEN, LEN));
                reported.set(v);
            } else {
                seen_once.set(v);
            }
        }
        return r;
    }
};