netfilter的规则处理

本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,严禁用于任何商业用途。
msn: [email protected]
来源:http://yfydz.cublog.cn
1. 前言
 
netfilter中的防火墙规则是通过用户层的iptables命令来进行编辑的,而规则都从属于某个表(见我以前关于netfilter新表的文章)。一般在mangle表中对数据包进行修改,在nat表中对数据包进行NAT,在filter表中进行过滤。所不同的是,nat表中的规则只对新包(NEW/RELATED)进行处理,而mangle和filter表中的规则对所有数据包都进行处理。
 
以下Linux内核代码版本为2.4.26。

2. 数据结构
每条规则是用结构struct ipt_entry来定义的:
/* include/linux/netfilter_ipv4/ip_tables.h */
/*
 * One rule as stored in a table's rule set: a fixed header followed,
 * in elems[], by the rule's matches (if any) and then its target.
 * Rules are variable-sized; target_offset and next_offset are byte
 * offsets measured from the start of this entry.
 */
struct ipt_entry
{
 /* Basic match criteria (see struct ipt_ip). */
 struct ipt_ip ip;
 /* Mark with fields that we care about. */
 unsigned int nfcache;
 /* Size of ipt_entry + matches */
 u_int16_t target_offset;
 /* Size of ipt_entry + matches + target */
 u_int16_t next_offset;
 /* Back pointer */
 unsigned int comefrom;
 /* Packet and byte counters. */
 struct ipt_counters counters;
 /* The matches (if any), then the target. */
 unsigned char elems[0];
};
参数说明:
struct ipt_ip ip:基本匹配项,包括协议、源地址/掩码、目的地址/掩码、进入网卡、出网卡等
unsigned int nfcache:标志项
u_int16_t target_offset:规则动作的偏移位置
u_int16_t next_offset:下一个规则的偏移位置
unsigned int comefrom:规则返回点
struct ipt_counters counters:计数器
unsigned char elems[0]:规则匹配项表,最后是动作项

ipt_ip结构:
/*
 * Basic per-rule match criteria: addresses with masks, input/output
 * interface names, protocol, and flag words.
 */
struct ipt_ip {
 /* Source and destination IP addr */
 struct in_addr src, dst;
 /* Mask for src and dest IP addr */
 struct in_addr smsk, dmsk;
 /* Input and output interface names */
 char iniface[IFNAMSIZ], outiface[IFNAMSIZ];
 /* Per-byte masks for the names above -- presumably selects which
  * name bytes are compared; confirm against ip_packet_match() */
 unsigned char iniface_mask[IFNAMSIZ], outiface_mask[IFNAMSIZ];
 /* Protocol, 0 = ANY */
 u_int16_t proto;
 /* Flags word */
 u_int8_t flags;
 /* Inverse flags */
 u_int8_t invflags;
};
规则中的匹配项结构,注意这不是描述匹配的结构struct ipt_match
/*
 * A match as embedded inside a rule (not the match module descriptor,
 * which is struct ipt_match).  The header is a union so that the same
 * bytes hold a module name in user space and a module pointer in the
 * kernel; match-specific payload follows in data[].
 */
struct ipt_entry_match
{
 union {
// User-space (iptables) view: only the match name is supplied
  struct {
   u_int16_t match_size;
   /* Used by userspace */
   char name[IPT_FUNCTION_MAXNAMELEN];
  } user;
// Kernel-space view: points at the concrete match module
  struct {
   u_int16_t match_size;
   /* Used inside the kernel */
   struct ipt_match *match;
  } kernel;
  /* Total length */
  u_int16_t match_size;
 } u;
 /* Match-specific data, laid out by the match module */
 unsigned char data[0];
};
规则中的目标(规则动作)项结构,注意这不是描述目标的结构struct ipt_target
/*
 * A target (rule action) as embedded inside a rule (not the target
 * module descriptor, which is struct ipt_target).  The header is a
 * union so that the same bytes hold a module name in user space and a
 * module pointer in the kernel; target-specific payload follows in
 * data[].
 */
struct ipt_entry_target
{
 union {
// User-space (iptables) view: only the target name is supplied
  struct {
   u_int16_t target_size;
   /* Used by userspace */
   char name[IPT_FUNCTION_MAXNAMELEN];
  } user;
// Kernel-space view: points at the concrete target module
  struct {
   u_int16_t target_size;
   /* Used inside the kernel */
   struct ipt_target *target;
  } kernel;
  /* Total length */
  u_int16_t target_size;
 } u;
 /* Target-specific data, laid out by the target module */
 unsigned char data[0];
};
3. 规则集操作函数
netfilter中处理规则的基本函数为ipt_do_table(),filter/mangle表的数据包最终都要进入该函数,而nat表只有NEW/RELATED的包才进入该函数。该函数遍历所定义的规则集,顺次进行匹配,一旦和规则的条件匹配成功,则按规则指定的动作返回,返回值可能为NF_ACCEPT/NF_DROP/NF_QUEUE/NF_STOLEN等。

/* net/ipv4/netfilter/ip_tables.c */
/*
 * Run a packet through the rules that `table` holds for `hook`.
 *
 * Starting at the hook's entry rule, each rule is tried in order: first
 * the basic ipt_ip criteria, then every match embedded in the rule.  On
 * a full match the rule's counters are bumped and its target decides
 * what happens next: a standard target yields a verdict, a RETURN, or a
 * jump to another offset; an extension target is called and either
 * returns a verdict or IPT_CONTINUE.
 *
 * pskb      pointer to the packet pointer (a target may replace it)
 * hook      hook number; indexes hook_entry[] / underflow[]
 * in, out   input/output devices, either may be NULL
 * table     table whose rule set is walked (read-locked for the walk)
 * userdata  passed through unchanged to extension targets
 *
 * Returns NF_DROP if a match set hotdrop, otherwise the verdict that
 * ended the walk (always NF_ACCEPT when DEBUG_ALLOW_ALL is defined).
 */
unsigned int
ipt_do_table(struct sk_buff **pskb,
      unsigned int hook,
      const struct net_device *in,
      const struct net_device *out,
      struct ipt_table *table,
      void *userdata)
{
 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))) = { 0 };
 u_int16_t offset;
 struct iphdr *ip;
 void *protohdr;
 u_int16_t datalen;
 int hotdrop = 0;
 /* Initializing verdict to NF_DROP keeps gcc happy. */
 unsigned int verdict = NF_DROP;
 const char *indev, *outdev;
 void *table_base;
 struct ipt_entry *e, *back;
 /* Initialization */
 ip = (*pskb)->nh.iph;
 protohdr = (u_int32_t *)ip + ip->ihl;
 datalen = (*pskb)->len - ip->ihl * 4;
// If the packet has no input or output device (NULL), match against
// the all-zero nulldevname instead
 indev = in ? in->name : nulldevname;
 outdev = out ? out->name : nulldevname;
 /* We handle fragments by dealing with the first fragment as
  * if it was a normal packet.  All other fragments are treated
  * normally, except that they will NEVER match rules that ask
  * things we don't know, ie. tcp syn flag or ports).  If the
  * rule is also a fragment-specific rule, non-fragments won't
  * match it. */
 offset = ntohs(ip->frag_off) & IP_OFFSET;
 read_lock_bh(&table->lock);
 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
// Locate the start of the rule set.  A table may define rules at several
// hooks, but all of them live together in one array; this picks the
// copy belonging to the current CPU
 table_base = (void *)table->private->entries
  + TABLE_OFFSET(table->private,
          cpu_number_map(smp_processor_id()));
// First rule for this hook
 e = get_entry(table_base, table->private->hook_entry[hook]);
#ifdef CONFIG_NETFILTER_DEBUG
 /* Check noone else using our table */
 if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac
     && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) {
  printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n",
         smp_processor_id(),
         table->name,
         &((struct ipt_entry *)table_base)->comefrom,
         ((struct ipt_entry *)table_base)->comefrom);
 }
 ((struct ipt_entry *)table_base)->comefrom = 0x57acc001;
#endif
// The underflow entry: the last rule of this hook's built-in chain,
// i.e. the chain's default action (accept everything or drop everything)
 /* For return from builtin chain */
 back = get_entry(table_base, table->private->underflow[hook]);
// The loop has no bound of its own; it terminates because the last rule
// of a chain is its default action, which always produces a verdict
// (barring corruption), or because a match sets hotdrop
 do {
  IP_NF_ASSERT(e);
  IP_NF_ASSERT(back);
  (*pskb)->nfcache |= e->nfcache;
// Match the basic criteria (those in struct ipt_ip) first; only if they
// pass are the rule's embedded matches evaluated
  if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
   struct ipt_entry_target *t;
// Run every match embedded in the rule; a non-zero result means some
// match failed
   if (IPT_MATCH_ITERATE(e, do_match,
           *pskb, in, out,
           offset, protohdr,
           datalen, &hotdrop) != 0)
    goto no_match;
// All conditions matched: update the rule's byte/packet counters
   ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
// Fetch the rule's target
   t = ipt_get_target(e);
   IP_NF_ASSERT(t->u.kernel.target);
   /* Standard target? */
   if (!t->u.kernel.target->target) {
    int v;
// Standard target.  A final verdict is stored as a negative number:
// ACCEPT is -NF_ACCEPT-1, DROP is -NF_DROP-1.  A non-negative v is a
// jump offset into the table
    v = ((struct ipt_standard_target *)t)->verdict;
    if (v < 0) {
     /* Pop from stack? */
     if (v != IPT_RETURN) {
// Convert the stored value back to the real NF_* verdict
      verdict = (unsigned)(-v) - 1;
      break;
     }
// IPT_RETURN: resume in the calling chain at the saved back pointer and
// pop the previous back pointer from its comefrom field
     e = back;
     back = get_entry(table_base,
        back->comefrom);
     continue;
    }
// v is a jump offset.  If it does not simply fall through to the next
// rule, remember where to return to
    if (table_base + v
        != (void *)e + e->next_offset) {
     /* Save old back ptr in next entry */
     struct ipt_entry *next
      = (void *)e + e->next_offset;
     next->comefrom
      = (void *)back - table_base;
     /* set back pointer to next entry */
     back = next;
    }
    e = get_entry(table_base, v);
   } else {
// Not a standard target: a separately defined target module
    /* Targets which reenter must return
                                   abs. verdicts */
#ifdef CONFIG_NETFILTER_DEBUG
    ((struct ipt_entry *)table_base)->comefrom
     = 0xeeeeeeec;
#endif
// Call the target module's target() function
    verdict = t->u.kernel.target->target(pskb,
             hook,
             in, out,
             t->data,
             userdata);
#ifdef CONFIG_NETFILTER_DEBUG
    if (((struct ipt_entry *)table_base)->comefrom
        != 0xeeeeeeec
        && verdict == IPT_CONTINUE) {
     printk("Target %s reentered!\n",
            t->u.kernel.target->name);
     verdict = NF_DROP;
    }
    ((struct ipt_entry *)table_base)->comefrom
     = 0x57acc001;
#endif
// The target may have modified the packet, or even replaced it with a
// copy, so the cached header pointers and lengths must be re-derived
    /* Target might have changed stuff. */
    ip = (*pskb)->nh.iph;
    protohdr = (u_int32_t *)ip + ip->ihl;
    datalen = (*pskb)->len - ip->ihl * 4;
    if (verdict == IPT_CONTINUE)
// IPT_CONTINUE: carry on with the next rule.
// Note that IPT_RETURN is not handled for extension targets
     e = (void *)e + e->next_offset;
    else
     /* Verdict */
     break;
   }
  } else {
// Rule did not match: advance to the next rule
  no_match:
   e = (void *)e + e->next_offset;
  }
// Matches receive a hotdrop flag that lets them demand the packet be
// dropped immediately; normally matches do not drop packets
 } while (!hotdrop);
#ifdef CONFIG_NETFILTER_DEBUG
 ((struct ipt_entry *)table_base)->comefrom = 0xdead57ac;
#endif
 read_unlock_bh(&table->lock);
#ifdef DEBUG_ALLOW_ALL
 return NF_ACCEPT;
#else
 if (hotdrop)
  return NF_DROP;
 else return verdict;
#endif
}

4. 规则的修改
netfilter本质上是以数组方式保存规则集的,但每条规则的大小可能不同,因此编辑规则的操作实际上比较麻烦。iptables的各种编辑规则的命令,实际上都是替换操作:IPT_SO_SET_REPLACE,对应的处理函数为do_replace()。

/* net/ipv4/netfilter/ip_tables.c */

/*
 * Handle IPT_SO_SET_REPLACE: swap a table's whole rule set for the one
 * supplied by user space, and hand the old rule set's counters back.
 *
 * user  user-space buffer: a struct ipt_replace header immediately
 *       followed by tmp.size bytes of rules
 * len   total length of that buffer as passed by the caller
 *
 * Returns 0 on success or a negative errno:
 *   -EFAULT       copying from user space failed
 *   -ENOPROTOOPT  len does not equal header size + tmp.size
 *   -ENOMEM       sizes too large / would overflow, or vmalloc failed
 *   -EINVAL       valid_hooks does not match the existing table
 *   otherwise whatever translate_table(), find_table_lock() or
 *   replace_table() reported.
 */
static int
do_replace(void *user, unsigned int len)
{
 int ret;
 struct ipt_replace tmp;
 struct ipt_table *t;
 struct ipt_table_info *newinfo, *oldinfo;
 struct ipt_counters *counters;
 /* Fetch the header describing the new rule set. */
 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
  return -EFAULT;
 /* Hack: Causes ipchains to give correct error msg --RR */
 if (len != sizeof(tmp) + tmp.size)
  return -ENOPROTOOPT;
 /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
  return -ENOMEM;
 /*
  * Overflow checks.  tmp.size and tmp.num_counters are user-controlled
  * and feed the two allocation sizes below.  If either computation
  * wrapped, the buffer would be smaller than the amount later written
  * into it (copy_from_user() of tmp.size bytes, memset() and
  * get_counters() over num_counters entries).
  */
 if (SMP_ALIGN(tmp.size) < tmp.size)
  return -ENOMEM; /* rounding up wrapped around */
 if ((unsigned long long)(SMP_ALIGN(tmp.size)) * smp_num_cpus
     + sizeof(struct ipt_table_info) > ~0U)
  return -ENOMEM; /* per-CPU copies would not fit in 32 bits */
 if (tmp.num_counters > ~0UL / sizeof(struct ipt_counters))
  return -ENOMEM; /* counter array size would wrap */
 /* Allocate room for the new rule set, one copy per CPU. */
 newinfo = vmalloc(sizeof(struct ipt_table_info)
     + SMP_ALIGN(tmp.size) * smp_num_cpus);
 if (!newinfo)
  return -ENOMEM;
 if (copy_from_user(newinfo->entries, user + sizeof(tmp),
      tmp.size) != 0) {
  ret = -EFAULT;
  goto free_newinfo;
 }
 /* Scratch space for the old rule set's counters, which are
  * returned to user space at the end. */
 counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters));
 if (!counters) {
  ret = -ENOMEM;
  goto free_newinfo;
 }
 memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
 /* Validate the rules and convert them to their in-kernel form. */
 ret = translate_table(tmp.name, tmp.valid_hooks,
         newinfo, tmp.size, tmp.num_entries,
         tmp.hook_entry, tmp.underflow);
 if (ret != 0)
  goto free_newinfo_counters;
 duprintf("ip_tables: Translated table\n");
 /* Look up the target table; on success ipt_mutex is held. */
 t = find_table_lock(tmp.name, &ret, &ipt_mutex);
 if (!t)
  goto free_newinfo_counters_untrans;
 /* You lied! */
 if (tmp.valid_hooks != t->valid_hooks) {
  duprintf("Valid hook crap: %08X vs %08X\n",
    tmp.valid_hooks, t->valid_hooks);
  ret = -EINVAL;
  goto free_newinfo_counters_untrans_unlock;
 }
 /* Install the new rule set in place of the old one. */
 oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
 if (!oldinfo)
  goto free_newinfo_counters_untrans_unlock;
 /* Update module usage count based on number of rules */
 duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
  oldinfo->number, oldinfo->initial_entries, newinfo->number);
 if (t->me && (oldinfo->number <= oldinfo->initial_entries) &&
      (newinfo->number > oldinfo->initial_entries))
  __MOD_INC_USE_COUNT(t->me);
 else if (t->me && (oldinfo->number > oldinfo->initial_entries) &&
    (newinfo->number <= oldinfo->initial_entries))
  __MOD_DEC_USE_COUNT(t->me);
 /* Get the old counters. */
 get_counters(oldinfo, counters);
 /* Decrease module usage counts and free resource */
 /* Runs cleanup_entry over every old rule before the old rule set
  * itself is freed. */
 IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
 vfree(oldinfo);
 /* Silent error: too late now. */
 /* The table is already replaced, so a failed copy of the counters
  * back to user space is deliberately not reported. */
 copy_to_user(tmp.counters, counters,
       sizeof(struct ipt_counters) * tmp.num_counters);
 vfree(counters);
 up(&ipt_mutex);
 return 0;
 /* Error unwinding: each label undoes one more step than the next. */
 free_newinfo_counters_untrans_unlock:
 up(&ipt_mutex);
 free_newinfo_counters_untrans:
 IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
 free_newinfo_counters:
 vfree(counters);
 free_newinfo:
 vfree(newinfo);
 return ret;
}

处理过程中比较重要的相关函数为translate_table()和replace_table(),也都在ip_tables.c中定义:

/*
 * Validate a rule set freshly copied from user space and finish
 * setting up `newinfo` for it.
 *
 * Steps, in order:
 *   1. record size/number and reset every hook slot to 0xFFFFFFFF;
 *   2. walk all entries with check_entry_size_and_hooks and make sure
 *      the number walked equals `number`;
 *   3. require every hook in `valid_hooks` to have both a hook entry
 *      and an underflow assigned;
 *   4. run mark_source_chains (-ELOOP if it fails);
 *   5. run check_entry on every rule, cleaning up the ones already
 *      checked if one fails;
 *   6. replicate the rule set for each additional CPU.
 *
 * Returns 0 on success, otherwise a negative errno.
 */
static int
translate_table(const char *name,
  unsigned int valid_hooks,
  struct ipt_table_info *newinfo,
  unsigned int size,
  unsigned int number,
  const unsigned int *hook_entries,
  const unsigned int *underflows)
{
 unsigned int hook;   /* index over netfilter hooks */
 unsigned int seen;   /* entries visited by an iterate pass */
 unsigned int cpu;    /* index over per-CPU copies */
 int err;

 newinfo->size = size;
 newinfo->number = number;

 /* Start with every hook slot holding an impossible offset. */
 for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
  newinfo->hook_entry[hook] = 0xFFFFFFFF;
  newinfo->underflow[hook] = 0xFFFFFFFF;
 }

 duprintf("translate_table: size %u\n", newinfo->size);

 /* Pass 1: sizes, offsets and hook assignment of each entry. */
 seen = 0;
 err = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
    check_entry_size_and_hooks,
    newinfo,
    newinfo->entries,
    newinfo->entries + size,
    hook_entries, underflows, &seen);
 if (err != 0)
  return err;

 if (seen != number) {
  duprintf("translate_table: %u not %u entries\n",
    seen, number);
  return -EINVAL;
 }

 /* Every hook the table claims as valid must have been assigned
  * both an entry point and an underflow by pass 1. */
 for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
  if (valid_hooks & (1 << hook)) {
   if (newinfo->hook_entry[hook] == 0xFFFFFFFF) {
    duprintf("Invalid hook entry %u %u\n",
      hook, hook_entries[hook]);
    return -EINVAL;
   }
   if (newinfo->underflow[hook] == 0xFFFFFFFF) {
    duprintf("Invalid underflow %u %u\n",
      hook, underflows[hook]);
    return -EINVAL;
   }
  }
 }

 /* Reject rule sets whose user-defined chains form a loop. */
 if (!mark_source_chains(newinfo, valid_hooks))
  return -ELOOP;

 /* Pass 2: per-rule check; `seen` counts the rules that passed so
  * that exactly those are cleaned up again on failure. */
 seen = 0;
 err = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
    check_entry, name, size, &seen);
 if (err != 0) {
  IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
      cleanup_entry, &seen);
  return err;
 }

 /* CPU 0's copy is in place; duplicate it for every other CPU. */
 for (cpu = 1; cpu < smp_num_cpus; cpu++) {
  memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*cpu,
         newinfo->entries,
         SMP_ALIGN(newinfo->size));
 }

 return 0;
}
 
/*
 * Atomically install `newinfo` as the rule set of `table`.
 *
 * The swap happens under the table's write lock.  It is refused if
 * `num_counters` no longer equals the entry count of the rule set
 * currently installed.
 *
 * Returns the previously installed rule set (now owned by the caller),
 * or NULL with *error set to -EAGAIN if the swap was refused.
 */
static struct ipt_table_info *
replace_table(struct ipt_table *table,
       unsigned int num_counters,
       struct ipt_table_info *newinfo,
       int *error)
{
 struct ipt_table_info *previous = NULL;

#ifdef CONFIG_NETFILTER_DEBUG
 {
  unsigned int cpu;

  /* Stamp the first entry of every per-CPU copy with the
   * debug sentinel value. */
  for (cpu = 0; cpu < smp_num_cpus; cpu++) {
   struct ipt_entry *first =
    (void *)newinfo->entries
    + TABLE_OFFSET(newinfo, cpu);

   first->comefrom = 0xdead57ac;
  }
 }
#endif

 write_lock_bh(&table->lock);
 if (num_counters == table->private->number) {
  /* Counts agree: detach the old rule set, attach the new one,
   * and carry the initial entry count across. */
  previous = table->private;
  table->private = newinfo;
  newinfo->initial_entries = previous->initial_entries;
 } else {
  /* The installed rule set changed since the caller sized its
   * counter buffer; refuse the swap. */
  duprintf("num_counters != table->private->number (%u/%u)\n",
    num_counters, table->private->number);
  *error = -EAGAIN;
 }
 write_unlock_bh(&table->lock);

 return previous;
}
 
5. 结论
netfilter的规则以数组方式顺序保存,但每个元素(规则)的大小可能不同:每条规则除了基本部分相同外,还包括数量不等的匹配项,最后是一个目标项。规则匹配是顺序匹配,而编辑时实际上是将整个规则集全部替换。

猜你喜欢

转载自cxw06023273.iteye.com/blog/866903
今日推荐