C++ 重复的DNA序列

将DNA序列看作是只包含[‘A’,‘C’,‘G’,‘T’]4个字符的字符串，给一个DNA字符串，找到所有长度为10的且出现超过1次的子串。
例如：
s=“AAAAACCCCCAAAAACCCCCAAAAAGGGTTT”,
Return:[“AAAAACCCCC”,“CCCCCAAAAA”]
s=“AAAAAAAAAA”
Return:[“AAAAAAAAAA”]

方法一：

#include <string>
#include<vector>
#include<map>
class Solution
{
public:
 Solution() {}
 ~Solution() {}
 std::vector<std::string> findRepeatedDnaSequences(std::string s)
 {
  std::vector<std::string> result;
  std::map<std::string, int> word_map;
  for (int i = 0; i < s.length(); i++)
  {
   std::string word = s.substr(i, 10);
   if (word_map.find(word) != word_map.end())
   {
    word_map[word]+=1;
   }
   else
   {
    word_map[word] = 1;
   }
  }
  std::map<std::string, int>::iterator it;
  for (it = word_map.begin(); it != word_map.end(); it++)
  {
   if (it->second > 1)
   {
    result.push_back(it->first);
   }
  }
  return result;
 }
};
int main()
{
 std::string s = "AAAAACCCCCAAAAACCCCCAAAAAGGGTTT";
 Solution solve;
 std::vector<std::string> result = solve.findRepeatedDnaSequences(s);
 for (int i = 0; i < result.size(); i++)
 {
  printf("%s\n", result[i].c_str());
 }
 return 0;
}

运行结果为：

AAAAACCCCC
AAAACCCCCA
AAACCCCCAA
AACCCCCAAA
ACCCCCAAAA
CCCCCAAAAA

方法二：

#include <string>
#include <vector>
int g_hash_map[1048576] = { 0 };
std::string change_int_to_DNA(int DNA)
{
 static const char DNA_CHAR[] = { 'A','C','G','T' };
 std::string str;
 for (int  i = 0; i < 10; i++)
 {
  str += DNA_CHAR[DNA & 3];
  DNA = DNA >> 2;
 }
 return str;
}
class Solution
{
public:
 Solution(){}
 ~Solution(){}
 std::vector<std::string> findRepeatedDnaSequences(std::string s)
 {
  std::vector<std::string> result;
  if (s.length()<10)
  {
   return result;
  }
  for (int i = 0; i < 1048576; i++)
  {
   g_hash_map[i] = 0;
  }
  int char_map[128] = { 0 };
  char_map['A'] = 0;
  char_map['C'] = 1;
  char_map['G'] = 2;
  char_map['T'] = 3;
  int key = 0;
  for (int i = 9; i >=0; i--)
  {
   key = (key << 2) + char_map[s[i]];
  }
  g_hash_map[key] = 1;
  for (int i = 10; i < s.length(); i++)
  {
   key = key >> 2;
   key = key | (char_map[s[i]] << 18);
   g_hash_map[key]++;
  }
  for (int i = 0; i < 1048576; i++)
  {
   if (g_hash_map[i]>1)
   {
    result.push_back(change_int_to_DNA(i));
   }
  }
  return result;
 }
};
int main()
{
 std::string s = "AAAAACCCCCAAAAACCCCCAAAAAGGGTTT";
 Solution solve;
 std::vector<std::string> result = solve.findRepeatedDnaSequences(s);
 for (int i = 0; i < result.size(); i++)
 {
  printf("%s\n", result[i].c_str());
 }
 return 0;
}

运行结果为：

CCCCCAAAAA
ACCCCCAAAA
AACCCCCAAA
AAACCCCCAA
AAAACCCCCA
AAAAACCCCC

Gianna K

发布了82 篇原创文章 · 获赞 65 · 访问量 3351

私信关注

猜你喜欢