Problem
DPVS is a DPDK application. Its design principle is that each core interacts with other cores as little as possible, which requires shared data to either be replicated per core or be core-private. In FNAT mode, the session table (flow table) stores connection state and is private to each lcore. This raises a problem: in full-NAT mode, the returning outbound packet must be dispatched to the same lcore that received the inbound packet, otherwise the conn cannot be found in that lcore's session table.
The DPVS solution
Introduce the FDIR (flow director) mechanism
The LIPs are identical on every lcore, so packets are dispatched to the correct core based on the local port (lport). The lport used in the FDIR computation is masked according to the number of enabled lcores (the dst_port_mask).
FDIR in the configuration file
First, take a look at the configuration file:
<init> device dpdk0 {
    rx {
        queue_number        8
        descriptor_number   1024
        rss                 all
    }
    tx {
        queue_number        8
        descriptor_number   1024
    }
    fdir {
        mode                perfect
        pballoc             64k
        status              matched
    }
    ! promisc_mode
    kni_name                dpdk0.kni
}
As shown, the rx queues are configured with RSS, and FDIR is enabled on the dpdk0 NIC.
Default FDIR configuration
When the NIC is initialized, default_port_conf is used; it contains the NIC's default FDIR configuration:
static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode        = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0,
        .offloads       = DEV_RX_OFFLOAD_IPV4_CKSUM,
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = NULL,
            .rss_hf  = /*ETH_RSS_IP*/ ETH_RSS_TCP,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
    .fdir_conf = {
        .mode    = RTE_FDIR_MODE_PERFECT,
        .pballoc = RTE_FDIR_PBALLOC_64K,
        .status  = RTE_FDIR_REPORT_STATUS /*_ALWAYS*/,
        .mask    = {
            .vlan_tci_mask = 0x0,
            .ipv4_mask     = {
                .src_ip = 0x00000000,
                .dst_ip = 0xFFFFFFFF,
            },
            .ipv6_mask     = {
                .src_ip = { 0, 0, 0, 0 },
                .dst_ip = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
            },
            .src_port_mask = 0x0000,
            /* to be changed according to slave lcore number in use */
            .dst_port_mask = 0x00F8,
            .mac_addr_byte_mask = 0x00,
            .tunnel_type_mask   = 0,
            .tunnel_id_mask     = 0,
        },
        .drop_queue = 127,
        .flex_conf  = {
            .nb_payloads  = 0,
            .nb_flexmasks = 0,
        },
    },
};
rx_adv_conf holds the RSS configuration; the default here is ETH_RSS_TCP.
The most important part is fdir_conf, with mode, pballoc, status and so on. The field to focus on is mask. FDIR can steer traffic on fields from different layers: ipv4_mask.src_ip is all zeros while ipv4_mask.dst_ip has all bits set, so FDIR looks only at the destination IP and ignores the source IP; likewise src_port_mask is 0 and dst_port_mask is non-zero. In other words, DPVS FDIR classifies only on dst_ip and the masked dst_port, which corresponds to <lip, lport>. Since there is only one lip, this is equivalent to looking at lport alone. So how is the dst_port_mask set?
How dst_port_mask is set is covered in the netif_port_start flow below (netif_port_fdir_dstport_mask_set).
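To make the mapping concrete, here is a minimal, standalone sketch (not DPVS code) that mirrors the mask computation for an assumed 8 slave lcores. The NIC only matches dst_port & dst_port_mask, so the low bits of a local port decide which lcore's FDIR filter (and hence rx queue) it hits.

#include <stdio.h>

/* Illustrative only: derives the mask the same way DPVS does, for an assumed
 * slave lcore count, and shows which port_base a given local port maps to. */
int main(void)
{
    unsigned slave_nb = 8;                  /* assumed number of slave lcores */
    int      shift;
    unsigned mask, lport;

    for (shift = 0; (0x1u << shift) < slave_nb; shift++)
        ;
    mask = ~((~0x0u) << shift);             /* 8 lcores -> shift = 3 -> mask = 0x0007 */

    printf("dst_port_mask = 0x%04x\n", mask);
    for (lport = 1025; lport < 1033; lport++)
        printf("lport %u -> port_base %u\n", lport, lport & mask);
    return 0;
}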
sa_pool and FDIR initialization
Each lcore has its own sa_pool, which manages the locally assigned <lip, lport> pairs. For example, with 64 lcores enabled and 65535-1024 usable ports, each lcore can use at most (65535-1024)/64 ports on a given lip.
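As a quick sanity check on that arithmetic, the sketch below (illustrative, not DPVS code) counts how many ports a single lcore would own under the mask scheme, assuming 64 slave lcores (6 mask bits) and the default port range 1025..65535; the result is about 1008.

#include <stdio.h>

int main(void)
{
    unsigned mask = 0x003F, port_base = 5;   /* assumed: 64 lcores, an arbitrary port_base */
    unsigned port, cnt = 0;

    for (port = 1025; port <= 65535; port++) /* DEF_MIN_PORT .. DEF_MAX_PORT */
        if ((port & mask) == port_base)
            cnt++;

    printf("one lcore owns %u of %u usable ports\n", cnt, 65535u - 1024u);
    return 0;
}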
Related structures
enum {
    SA_F_USED = 0x01,
};

/**
 * if really need to to save memory, we can;
 * 1. use hlist_head
 * 2. use uint8_t flag
 * 3. remove sa_entry.addr, and get IP from sa_pool->ifa
 * 4. to __packed__ sa_entry.
 * 5. alloc sa_entries[] for 65536/cpu_num only.
 * 6. create sa_entry_pool only if pool_hash hit.
 * since when dest (like RS) num may small.
 */

/* socket address (sa) is <ip, port> pair. */
struct sa_entry {
    struct list_head list;   /* node of sa_pool. */
    /* flags; currently only SA_F_USED, marking whether the port is in use (free/busy) */
    uint32_t         flags;  /* SA_F_XXX */
    union inet_addr  addr;
    __be16           port;
};

struct sa_entry_pool {
    /* sa_entry hash table */
    struct sa_entry  sa_entries[MAX_PORT];
    /* head of the used sa_entry list */
    struct list_head used_enties;
    /* head of the free sa_entry list */
    struct list_head free_enties;
    /* another way is use total_used/free_cnt in sa_pool,
     * so that we need not travels the hash to get stats.
     * we use cnt here, since we may need per-pool stats. */
    /* statistics counters */
    uint16_t used_cnt;
    uint16_t free_cnt;
    uint32_t miss_cnt;
};

/* no lock needed because inet_ifaddr.sa_pool
 * is per-lcore. */
struct sa_pool {
    /* the IP address block (inet_ifaddr) this sa_pool belongs to */
    struct inet_ifaddr   *ifa;          /* back-pointer */
    /* low/high port bounds */
    uint16_t              low;          /* min port */
    uint16_t              high;         /* max port */
    /* reference count */
    rte_atomic32_t        refcnt;
    /* hashed pools by dest's <ip/port>. if no dest provided,
     * just use first pool. it's not need create/destroy pool
     * for each dest, that'll be too complicated. */
    struct sa_entry_pool *pool_hash;
    /* number of buckets in the hash table */
    uint8_t               pool_hash_sz;
    uint32_t              flags;        /* SA_POOL_F_XXX */
    /* fdir filter ID */
    uint32_t              filter_id[MAX_FDIR_PROTO];
};

struct sa_fdir {
    /* the ports one lcore can use means
     * "(fdir.mask & port) == port_base" */
    /* port mask */
    uint16_t  mask;        /* filter's port mask */
    /* owning lcore id; one sa_fdir per lcore */
    lcoreid_t lcore;
    /* when picking a local port on this lcore, an lport whose masked value
     * equals port_base is assigned to this core */
    __be16    port_base;
    uint16_t  soft_id;     /* current unsed soft-id,
                            * increase after use. */
};
At program initialization, sa_pool_init is called to initialize the global fdir table (sa_fdirs):
static struct sa_fdir sa_fdirs[DPVS_MAX_LCORE];

int sa_pool_init(void)
{
    int       shift;
    lcoreid_t cid;
    uint16_t  port_base;

    /* enabled lcore should not change after init */
    /* get the number and mask of enabled slave lcores */
    netif_get_slave_lcores(&sa_nlcore, &sa_lcore_mask);

    /* how many mask bits needed ? */
    /* compute ceil(log2(sa_nlcore)) */
    for (shift = 0; (0x1 << shift) < sa_nlcore; shift++) {
        ;
    }
    if (shift >= 16) {
        return(EDPVS_INVAL);    /* bad config */
    }

    port_base = 0;
    for (cid = 0; cid < DPVS_MAX_LCORE; cid++) {
        /* skip cid >= 64 and lcores that are not enabled */
        if (cid >= 64 || !(sa_lcore_mask & (1L << cid))) {
            continue;
        }
        assert(rte_lcore_is_enabled(cid) && cid != rte_get_master_lcore());

        /* sa_fdirs is a per-lcore structure; the mask lets a bitwise AND replace modulo */
        sa_fdirs[cid].mask      = ~((~0x0) << shift);
        sa_fdirs[cid].lcore     = cid;
        /* during FDIR classification, an lport whose masked value equals port_base
         * is steered to this core */
        sa_fdirs[cid].port_base = htons(port_base);
        sa_fdirs[cid].soft_id   = 0;

        port_base++;
    }

    return(EDPVS_OK);
}
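One consequence of this loop, sketched below (illustrative, not DPVS code): port_base only takes values 0..nlcore-1, so when the slave lcore count is not a power of two, some masked values match no lcore's port_base, and the corresponding local ports are never handed out. The example assumes 6 slave lcores.

#include <stdio.h>

int main(void)
{
    unsigned nlcore = 6, mask, v;   /* assumed slave lcore count */
    int      shift;

    for (shift = 0; (0x1u << shift) < nlcore; shift++)
        ;
    mask = ~((~0x0u) << shift);     /* 6 lcores -> mask = 0x0007 */

    for (v = 0; v <= mask; v++)
        printf("ports with (port & 0x%04x) == %u: %s\n", mask, v,
               v < nlcore ? "owned by one lcore" : "never used as lport");
    return 0;
}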
When a LIP is added via ipvsadm, ifa_add_set calls sa_pool_create to initialize the sa_pool:
int sa_pool_create(struct inet_ifaddr *ifa, uint16_t low, uint16_t high)
{
    int             err;
    struct sa_pool *ap;
    struct sa_fdir *fdir;
    uint32_t        filtids[MAX_FDIR_PROTO];
    lcoreid_t       cid = rte_lcore_id();

    /* check whether the current lcore is in sa_lcore_mask */
    if (cid > 64 || !((sa_lcore_mask & (1UL << cid)))) {
        if (cid == rte_get_master_lcore()) {
            return(EDPVS_OK);    /* no sapool on master */
        }
        return(EDPVS_INVAL);
    }

    /* low/high ports default to 1025 and 65535 */
    low  = low ? : DEF_MIN_PORT;
    high = high ? : DEF_MAX_PORT;

    /* argument checks */
    if (!ifa || low > high || low == 0 || high >= MAX_PORT) {
        RTE_LOG(ERR, SAPOOL, "%s: bad arguments\n", __func__);
        return(EDPVS_INVAL);
    }

    /* get the sa_fdir object for this lcore */
    fdir = &sa_fdirs[cid];

    /* allocate the sa_pool object */
    ap = rte_zmalloc(NULL, sizeof(struct sa_pool), 0);
    if (unlikely(!ap)) {
        return(EDPVS_NOMEM);
    }

    /* fill in the sa_pool fields */
    ap->ifa   = ifa;
    ap->low   = low;
    ap->high  = high;
    ap->flags = 0;
    rte_atomic32_set(&ap->refcnt, 1);

    /* allocate the socket-address hash table; 16 buckets by default */
    err = sa_pool_alloc_hash(ap, sa_pool_hash_size, fdir);
    if (err != EDPVS_OK) {
        goto free_ap;
    }

    filtids[0] = fdir->soft_id++;
    filtids[1] = fdir->soft_id++;
    /* add the fdir filters used for matching */
    err = sa_add_filter(ifa->af, ifa->idev->dev, cid, &ifa->addr,
                        fdir->port_base, filtids);    /* thread-safe ? */
    if (err != EDPVS_OK) {
        goto free_hash;
    }
    ap->filter_id[0] = filtids[0];
    ap->filter_id[1] = filtids[1];

    ifa->sa_pool = ap;

    /* inc ifa->refcnt to hold it */
    rte_atomic32_inc(&ifa->refcnt);

#ifdef CONFIG_DPVS_SAPOOL_DEBUG
    {
        char addr[64];
        RTE_LOG(INFO, SAPOOL, "[%02d] %s: sa pool created -- %s\n",
                rte_lcore_id(), __func__,
                inet_ntop(ifa->af, &ifa->addr, addr, sizeof(addr)) ? : NULL);
    }
#endif

    return(EDPVS_OK);

free_hash:
    sa_pool_free_hash(ap);
free_ap:
    rte_free(ap);
    return(err);
}
sa_pool_alloc_hash
static int sa_pool_alloc_hash(struct sa_pool *ap, uint8_t hash_sz,
                              const struct sa_fdir *fdir)
{
    int                   hash;
    struct sa_entry_pool *pool;
    uint32_t              port;    /* should be u32 or 65535==0 */

    /* allocate the sa_pool->pool_hash buckets */
    ap->pool_hash = rte_malloc(NULL, sizeof(struct sa_entry_pool) * hash_sz,
                               RTE_CACHE_LINE_SIZE);
    if (!ap->pool_hash) {
        return(EDPVS_NOMEM);
    }

    /* record the bucket count */
    ap->pool_hash_sz = hash_sz;

    /* the big loop takes about 17ms */
    for (hash = 0; hash < hash_sz; hash++) {
        pool = &ap->pool_hash[hash];

        /* initialize the used/free list heads */
        INIT_LIST_HEAD(&pool->used_enties);
        INIT_LIST_HEAD(&pool->free_enties);
        pool->used_cnt = 0;
        pool->free_cnt = 0;

        /* assign to this lcore only the ports satisfying
         * fdir->mask && ((uint16_t)port & fdir->mask) == ntohs(fdir->port_base) */
        for (port = ap->low; port <= ap->high; port++) {
            struct sa_entry *sa;

            if (fdir->mask &&
                ((uint16_t)port & fdir->mask) != ntohs(fdir->port_base)) {
                continue;
            }

            sa       = &pool->sa_entries[(uint16_t)port];
            sa->addr = ap->ifa->addr;
            sa->port = htons((uint16_t)port);
            list_add_tail(&sa->list, &pool->free_enties);
            pool->free_cnt++;
        }
    }

    return(EDPVS_OK);
}
sa_add_filter
static inline int sa_add_filter(int af, struct netif_port *dev, lcoreid_t cid,
                                const union inet_addr *dip, __be16 dport,
                                uint32_t filter_id[MAX_FDIR_PROTO])
{
    return(__add_del_filter(af, dev, cid, dip, dport, filter_id, true));
}

__add_del_filter
It installs a flow director filter on (destination IP, destination port), or more precisely on the port after applying fdir_conf.mask.dst_port_mask (dst_port & dst_port_mask).
fdir_conf.mask.dst_port_mask itself is set by netif_port_fdir_dstport_mask_set, called from the port start-up path (netif_port_start, shown further below).
static int __add_del_filter(int af, struct netif_port *dev, lcoreid_t cid,
                            const union inet_addr *dip, __be16 dport,
                            uint32_t filter_id[MAX_FDIR_PROTO], bool add)
{
    queueid_t          queue;
    int                err;
    enum rte_filter_op op, rop;

    /* flow director filters; the action is to accept (receive) the packet */
    struct rte_eth_fdir_filter filt[MAX_FDIR_PROTO] = {
        {
            .action.behavior      = RTE_ETH_FDIR_ACCEPT,
            .action.report_status = RTE_ETH_FDIR_REPORT_ID,
            .soft_id              = filter_id[0],
        },
        {
            .action.behavior      = RTE_ETH_FDIR_ACCEPT,
            .action.report_status = RTE_ETH_FDIR_REPORT_ID,
            .soft_id              = filter_id[1],
        },
    };

    if (af == AF_INET) {
        /* IPv4 flow director filter: only <dst ip, dst port> are matched, and the
         * dst port is the one obtained after applying mask.dst_port_mask.
         * e.g. if dport is 0 and lcore_count is 4, destination ports 0, 4, 8, 12, ...
         * will be steered to this lcore. */
        filt[0].input.flow_type                = RTE_ETH_FLOW_NONFRAG_IPV4_TCP;
        filt[0].input.flow.tcp4_flow.ip.dst_ip = dip->in.s_addr;
        filt[0].input.flow.tcp4_flow.dst_port  = dport;
        filt[1].input.flow_type                = RTE_ETH_FLOW_NONFRAG_IPV4_UDP;
        filt[1].input.flow.udp4_flow.ip.dst_ip = dip->in.s_addr;
        filt[1].input.flow.udp4_flow.dst_port  = dport;
    } else if (af == AF_INET6) {
        filt[0].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV6_TCP;
        memcpy(filt[0].input.flow.ipv6_flow.dst_ip, &dip->in6, sizeof(struct in6_addr));
        filt[0].input.flow.tcp6_flow.dst_port = dport;
        filt[1].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV6_UDP;
        memcpy(filt[1].input.flow.ipv6_flow.dst_ip, &dip->in6, sizeof(struct in6_addr));
        filt[1].input.flow.udp6_flow.dst_port = dport;
    } else {
        return(EDPVS_NOTSUPP);
    }

    /* check whether the device supports flow director (dpdk and bond devices);
     * internally calls rte_eth_dev_filter_supported */
    if (dev->netif_ops && dev->netif_ops->op_filter_supported) {
        if (dev->netif_ops->op_filter_supported(dev, RTE_ETH_FILTER_FDIR) < 0) {
            if (dev->nrxq <= 1) {
                return(EDPVS_OK);
            }
            RTE_LOG(ERR, SAPOOL, "%s: FDIR is not supported by device %s. Only"
                    " single rxq can be configured.\n", __func__, dev->name);
            return(EDPVS_NOTSUPP);
        }
    } else {
        RTE_LOG(ERR, SAPOOL, "%s: FDIR support of device %s is not known.\n",
                __func__, dev->name);
        return(EDPVS_INVAL);
    }

    /* get the rx queue of this port handled by the given lcore */
    err = netif_get_queue(dev, cid, &queue);
    if (err != EDPVS_OK) {
        return(err);
    }

    /* set the flow director rx queue, binding the filter to the NIC hardware queue */
    filt[0].action.rx_queue = filt[1].action.rx_queue = queue;
    op = add ? RTE_ETH_FILTER_ADD : RTE_ETH_FILTER_DELETE;

    /* apply the port's configured fdir masks to the filters */
    netif_mask_fdir_filter(af, dev, &filt[0]);
    netif_mask_fdir_filter(af, dev, &filt[1]);

    /* internally calls rte_eth_dev_filter_ctrl to install the flow filter rules */
    err = netif_fdir_filter_set(dev, op, &filt[0]);
    if (err != EDPVS_OK) {
        return(err);
    }

    err = netif_fdir_filter_set(dev, op, &filt[1]);
    if (err != EDPVS_OK) {
        rop = add ? RTE_ETH_FILTER_DELETE : RTE_ETH_FILTER_ADD;
        netif_fdir_filter_set(dev, rop, &filt[0]);
        return(err);
    }

#ifdef CONFIG_DPVS_SAPOOL_DEBUG
    {
        char ipaddr[64];
        RTE_LOG(DEBUG, SAPOOL, "FDIR: %s %s %s TCP/UDP "
                "ip %s port %d (0x%04x) mask 0x%04X queue %d lcore %2d filterID %d/%d\n",
                add ? "add" : "del", dev->name, af == AF_INET ? "IPv4" : "IPv6",
                inet_ntop(af, dip, ipaddr, sizeof(ipaddr)) ? : "::",
                ntohs(dport), ntohs(dport), sa_fdirs[cid].mask, queue, cid,
                filter_id[0], filter_id[1]);
    }
#endif

    return(err);
}
netif_mask_fdir_filter
This ANDs the filter's input-flow fields with the port's configured FDIR masks, clearing any fields that flow director will not match on:
void netif_mask_fdir_filter(int af, const struct netif_port *port,
                            struct rte_eth_fdir_filter *filt)
{
    struct rte_eth_fdir_info         fdir_info;
    const struct rte_eth_fdir_masks *fmask;
    /* first get the input flow definition from the fdir filter */
    union rte_eth_fdir_flow         *flow = &filt->input.flow;
    /*
     * union rte_eth_fdir_flow {
     *     struct rte_eth_l2_flow       l2_flow;
     *     struct rte_eth_udpv4_flow    udp4_flow;
     *     struct rte_eth_tcpv4_flow    tcp4_flow;
     *     struct rte_eth_sctpv4_flow   sctp4_flow;
     *     struct rte_eth_ipv4_flow     ip4_flow;
     *     struct rte_eth_udpv6_flow    udp6_flow;
     *     struct rte_eth_tcpv6_flow    tcp6_flow;
     *     struct rte_eth_sctpv6_flow   sctp6_flow;
     *     struct rte_eth_ipv6_flow     ipv6_flow;
     *     struct rte_eth_mac_vlan_flow mac_vlan_flow;
     *     struct rte_eth_tunnel_flow   tunnel_flow;
     * };
     */

    /* There exists a defect here. If the netif_port 'port' is not PORT_TYPE_GENERAL,
     * mask fdir_filter of the port would fail. The correct way to accomplish the
     * function is to register this method for all device types. Considering the flow
     * is not changed after masking, we just skip netif_ports other than physical ones. */
    if (port->type != PORT_TYPE_GENERAL) {
        return;
    }

    /* retrieve fdir information */
    if (rte_eth_dev_filter_ctrl(port->id, RTE_ETH_FILTER_FDIR,
                                RTE_ETH_FILTER_INFO, &fdir_info) < 0) {
        RTE_LOG(DEBUG, NETIF, "%s: Fail to fetch fdir info of %s !\n",
                __func__, port->name);
        return;
    }
    fmask = &fdir_info.mask;

    /* ipv4 flow */
    if (af == AF_INET) {
        flow->ip4_flow.src_ip    &= fmask->ipv4_mask.src_ip;
        flow->ip4_flow.dst_ip    &= fmask->ipv4_mask.dst_ip;
        flow->ip4_flow.tos       &= fmask->ipv4_mask.tos;
        flow->ip4_flow.ttl       &= fmask->ipv4_mask.ttl;
        flow->ip4_flow.proto     &= fmask->ipv4_mask.proto;
        flow->tcp4_flow.src_port &= fmask->src_port_mask;
        flow->tcp4_flow.dst_port &= fmask->dst_port_mask;
        return;
    }

    /* ipv6 flow */
    if (af == AF_INET6) {
        flow->ipv6_flow.src_ip[0]  &= fmask->ipv6_mask.src_ip[0];
        flow->ipv6_flow.src_ip[1]  &= fmask->ipv6_mask.src_ip[1];
        flow->ipv6_flow.src_ip[2]  &= fmask->ipv6_mask.src_ip[2];
        flow->ipv6_flow.src_ip[3]  &= fmask->ipv6_mask.src_ip[3];
        flow->ipv6_flow.dst_ip[0]  &= fmask->ipv6_mask.dst_ip[0];
        flow->ipv6_flow.dst_ip[1]  &= fmask->ipv6_mask.dst_ip[1];
        flow->ipv6_flow.dst_ip[2]  &= fmask->ipv6_mask.dst_ip[2];
        flow->ipv6_flow.dst_ip[3]  &= fmask->ipv6_mask.dst_ip[3];
        flow->ipv6_flow.tc         &= fmask->ipv6_mask.tc;
        flow->ipv6_flow.proto      &= fmask->ipv6_mask.proto;
        flow->ipv6_flow.hop_limits &= fmask->ipv6_mask.hop_limits;
        flow->tcp6_flow.src_port   &= fmask->src_port_mask;
        flow->tcp6_flow.dst_port   &= fmask->dst_port_mask;
        return;
    }
}
netif_port_fdir_dstport_mask_set
Called from netif_port_start when the port is brought up:
/*
 * fdir mask must be set according to configured slave lcore number
 */
inline static int netif_port_fdir_dstport_mask_set(struct netif_port *port)
{
    uint8_t slave_nb;
    int     shift;

    netif_get_slave_lcores(&slave_nb, NULL);
    for (shift = 0; (0x1 << shift) < slave_nb; shift++) {
        ;
    }
    if (shift >= 16) {
        RTE_LOG(ERR, NETIF, "%s: %s's fdir dst_port_mask init failed\n",
                __func__, port->name);
        return(EDPVS_NOTSUPP);
    }
#if RTE_VERSION >= 0x10040010
    port->dev_conf.fdir_conf.mask.dst_port_mask = htons(~((~0x0) << shift));
#else
    port->dev_conf.fdir_conf.mask.dst_port_mask = ~((~0x0) << shift);
#endif

    RTE_LOG(INFO, NETIF, "%s:dst_port_mask=%0x\n", port->name,
            port->dev_conf.fdir_conf.mask.dst_port_mask);
    return(EDPVS_OK);
}
Using FDIR with the conn (session) table
The FDIR setup above ensures that, in FNAT, outbound traffic returning to DPVS lands on the same lcore that handled the inbound traffic, avoiding locks on session-table lookups, cache invalidation, and so on.
Now let's look at how DPVS allocates the local address and local port from the sa_pool. For every newly created connection, dp_vs_conn_new is called to register the session in the flow table.
dp_vs_conn_new
Only full-NAT supports a local address/port:

struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf,
                                  const struct dp_vs_iphdr *iph,
                                  struct dp_vs_conn_param *param,
                                  struct dp_vs_dest *dest, uint32_t flags)
{
    /* ... */
    /* special handling for full-nat */
    if (dest->fwdmode == DPVS_FWD_MODE_FNAT) {
        /* bind a local address on the LB; full-nat translates both directions */
        if ((err = dp_vs_laddr_bind(new, dest->svc)) != EDPVS_OK) {
            goto unbind_dest;
        }
    }
    /* ... */
}
dp_vs_laddr_bind
int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc)
{
    struct dp_vs_laddr     *laddr = NULL;
    int                     i;
    uint16_t                sport = 0;
    struct sockaddr_storage dsin, ssin;
    bool                    found = false;

    /* routine sanity checks */
    if (!conn || !conn->dest || !svc) {
        return(EDPVS_INVAL);
    }
    /* return an error for transport protocols other than TCP/UDP,
     * since the fdir filters only cover TCP and UDP */
    if (svc->proto != IPPROTO_TCP && svc->proto != IPPROTO_UDP) {
        return(EDPVS_NOTSUPP);
    }
    if (dp_vs_conn_is_template(conn)) {
        return(EDPVS_OK);
    }

    /*
     * some time allocate lport fails for one laddr,
     * but there's also some resource on another laddr.
     */
    for (i = 0; i < dp_vs_laddr_max_trails && i < svc->num_laddrs; i++) {
        /* select a local IP from service */
        laddr = __get_laddr(svc);
        if (!laddr) {
            RTE_LOG(ERR, IPVS, "%s: no laddr available.\n", __func__);
            return(EDPVS_RESOURCE);
        }

        /* clear dsin and ssin first */
        memset(&dsin, 0, sizeof(struct sockaddr_storage));
        memset(&ssin, 0, sizeof(struct sockaddr_storage));

        if (laddr->af == AF_INET) {
            /* full-nat rewrites the destination to the selected RS's ip:port and the
             * source to the local ip (and later the local port), so that outbound
             * traffic also comes back to dpvs */
            struct sockaddr_in *daddr, *saddr;
            daddr             = (struct sockaddr_in *)&dsin;
            daddr->sin_family = laddr->af;
            daddr->sin_addr   = conn->daddr.in;
            daddr->sin_port   = conn->dport;
            saddr             = (struct sockaddr_in *)&ssin;
            saddr->sin_family = laddr->af;
            saddr->sin_addr   = laddr->addr.in;
        } else {
            struct sockaddr_in6 *daddr, *saddr;
            daddr              = (struct sockaddr_in6 *)&dsin;
            daddr->sin6_family = laddr->af;
            daddr->sin6_addr   = conn->daddr.in6;
            daddr->sin6_port   = conn->dport;
            saddr              = (struct sockaddr_in6 *)&ssin;
            saddr->sin6_family = laddr->af;
            saddr->sin6_addr   = laddr->addr.in6;
        }

        /* sa_fetch picks a port to complete dsin/ssin; the chosen port is what lets
         * fdir steer outbound traffic back to the current lcore */
        if (sa_fetch(laddr->af, laddr->iface, &dsin, &ssin) != EDPVS_OK) {
            char buf[64];
            if (inet_ntop(laddr->af, &laddr->addr, buf, sizeof(buf)) == NULL) {
                snprintf(buf, sizeof(buf), "::");
            }
#ifdef CONFIG_DPVS_IPVS_DEBUG
            RTE_LOG(DEBUG, IPVS, "%s: [%02d] no lport available on %s, "
                    "try next laddr.\n", __func__, rte_lcore_id(), buf);
#endif
            put_laddr(laddr);
            continue;
        }

        /* sport is the selected local port */
        sport = (laddr->af == AF_INET ? (((struct sockaddr_in *)&ssin)->sin_port)
                                      : (((struct sockaddr_in6 *)&ssin)->sin6_port));
        found = true;
        break;
    }

    /* return an error if no laddr/lport could be selected */
    if (!found) {
#ifdef CONFIG_DPVS_IPVS_DEBUG
        RTE_LOG(ERR, IPVS, "%s: [%02d] no lip/lport available !!\n",
                __func__, rte_lcore_id());
#endif
        return(EDPVS_RESOURCE);
    }

    rte_atomic32_inc(&laddr->conn_counts);

    /* overwrite related fields in out-tuplehash and conn */
    /* record the local address and local port in the conn */
    conn->laddr = laddr->addr;
    conn->lport = sport;
    /* update the outbound tuplehash entry with the local address info */
    tuplehash_out(conn).daddr = laddr->addr;
    tuplehash_out(conn).dport = sport;
    /* keep a reference to the selected laddr */
    conn->local = laddr;

    return(EDPVS_OK);
}
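The invariant worth spelling out here, sketched below (illustrative, not DPVS code): the lport that sa_fetch returns on a given lcore satisfies (ntohs(lport) & mask) == ntohs(port_base) for that lcore's sa_fdirs entry, which is exactly the condition the installed FDIR filter matches on, so the RS reply (whose dst port is this lport) is steered back to the same lcore. The concrete values below are assumptions for illustration.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>   /* htons, ntohs */

int main(void)
{
    uint16_t mask      = 0x0007;        /* assumed: 8 slave lcores */
    uint16_t port_base = htons(3);      /* assumed: this lcore's port_base */
    uint16_t lport     = htons(1027);   /* a port this lcore's sapool could return */

    printf("lport 1027 steered back to this lcore: %s\n",
           ((ntohs(lport) & mask) == ntohs(port_base)) ? "yes" : "no");
    return 0;
}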
__get_laddr
static inline struct dp_vs_laddr *__get_laddr(struct dp_vs_service *svc)
{
    int                 step;
    struct dp_vs_laddr *laddr = NULL;

    /* if list not inited ? list_empty() returns true ! */
    assert(svc->laddr_list.next);

    /* return NULL if there is no laddr at all */
    if (list_empty(&svc->laddr_list)) {
        return(NULL);
    }

    /* pick a (possibly randomized) step count */
    step = __laddr_step(svc);
    while (step-- > 0) {
        if (unlikely(!svc->laddr_curr)) {
            svc->laddr_curr = svc->laddr_list.next;
        } else {
            svc->laddr_curr = svc->laddr_curr->next;
        }

        /* circular list: skip over the list head */
        if (svc->laddr_curr == &svc->laddr_list) {
            svc->laddr_curr = svc->laddr_list.next;
        }
    }

    /* get the struct dp_vs_laddr */
    laddr = list_entry(svc->laddr_curr, struct dp_vs_laddr, list);
    rte_atomic32_inc(&laddr->refcnt);

    return(laddr);
}
__laddr_step
static inline int __laddr_step(struct dp_vs_service *svc)
{
    /* Why can't we always use the next laddr(rr scheduler) to setup new session?
     * Because realserver rr/wrr scheduler may get synchronous with the laddr rr
     * scheduler. If so, the local IP may stay invariant for a specified realserver,
     * which is a hurt for realserver concurrency performance. To avoid the problem,
     * we just choose 5% sessions to use the one after the next laddr randomly.
     */
    if (strncmp(svc->scheduler->name, "rr", 2) == 0 ||
        strncmp(svc->scheduler->name, "wrr", 3) == 0) {
        return((random() % 100) < 5 ? 2 : 1);
    }

    return(1);
}
sa_fetch
A wrapper that dispatches by address family:
int sa_fetch(int af, struct netif_port *dev,
             const struct sockaddr_storage *daddr,
             struct sockaddr_storage *saddr)
{
    if (unlikely(daddr && daddr->ss_family != af)) {
        return(EDPVS_INVAL);
    }
    if (unlikely(saddr && saddr->ss_family != af)) {
        return(EDPVS_INVAL);
    }
    if (AF_INET == af) {
        return(sa4_fetch(dev, (const struct sockaddr_in *)daddr,
                         (struct sockaddr_in *)saddr));
    } else if (AF_INET6 == af) {
        return(sa6_fetch(dev, (const struct sockaddr_in6 *)daddr,
                         (struct sockaddr_in6 *)saddr));
    } else {
        return(EDPVS_NOTSUPP);
    }
}
sa4_fetch
Let's focus on how the IPv4 lport is selected: sa4_fetch looks up an unused <saddr, sport> pair.
/*
 * fetch unused <saddr, sport> pair by given hint.
 * given @ap equivalent to @dev+@saddr, and dport is useless.
 * with routing's help, the mapping looks like,
 *
 * +------+-------------+-------+-------------------
 * |      |     ap      |       |  Is possible to
 * |daddr | dev & saddr | sport | fetch addr pair?
 * +------+-------------+-------+-------------------
 *    Y       Y     ?       Y       Possible
 *    Y       Y     Y       ?       Possible
 *    Y       Y     ?       ?       Possible
 *    Y       N     ?       Y       Possible
 *    Y       N     Y       ?       Possible
 *    Y       N     ?       ?       Possible
 *    N       Y     ?       Y       Possible
 *    N       Y     Y       ?       Possible
 *    N       Y     ?       ?       Possible
 *    N       N     ?       Y       Not Possible
 *    N       N     Y       ?       Possible
 *    N       N     ?       ?       Not Possible
 *
 * daddr is a hint to found dev/saddr (by route/netif module).
 * dev is also a hint, the saddr(ifa) is the key.
 * af is needed when both saddr and daddr are NULL.
 */
static int sa4_fetch(struct netif_port *dev,
                     const struct sockaddr_in *daddr,
                     struct sockaddr_in *saddr)
{
    struct inet_ifaddr *ifa;
    struct flow4        fl;
    struct route_entry *rt;
    int                 err;

    assert(saddr);

    /* argument checks */
    if (saddr && saddr->sin_addr.s_addr != INADDR_ANY && saddr->sin_port != 0) {
        return(EDPVS_OK);    /* everything is known, why call this function ? */
    }

    /* if source IP is assiged, we can find ifa->sa_pool
     * without @daddr and @dev. */
    if (saddr->sin_addr.s_addr) {
        /* get the inet_ifaddr block of this local ip on the device,
         * mainly to reach its sa_pool */
        ifa = inet_addr_ifa_get(AF_INET, dev, (union inet_addr *)&saddr->sin_addr);
        if (!ifa) {
            return(EDPVS_NOTEXIST);
        }

        /* fail if the address has no sa_pool configured */
        if (!ifa->sa_pool) {
            RTE_LOG(WARNING, SAPOOL, "%s: fetch addr on IP without sapool.", __func__);
            inet_addr_ifa_put(ifa);
            return(EDPVS_INVAL);
        }

        /* fetch the local port */
        err = sa_pool_fetch(sa_pool_hash(ifa->sa_pool, (struct sockaddr_storage *)daddr),
                            (struct sockaddr_storage *)saddr);
        if (err == EDPVS_OK) {
            rte_atomic32_inc(&ifa->sa_pool->refcnt);
        }
        inet_addr_ifa_put(ifa);
        return(err);
    }

    /* try to find source ifa by @dev and @daddr */
    /* no saddr given: look up the output route by destination first */
    memset(&fl, 0, sizeof(struct flow4));
    fl.fl4_oif          = dev;
    fl.fl4_daddr.s_addr = daddr ? daddr->sin_addr.s_addr : htonl(INADDR_ANY);
    fl.fl4_saddr.s_addr = saddr ? saddr->sin_addr.s_addr : htonl(INADDR_ANY);
    rt = route4_output(&fl);
    if (!rt) {
        return(EDPVS_NOROUTE);
    }

    /* select source address. */
    if (!rt->src.s_addr) {
        inet_addr_select(AF_INET, rt->port, (union inet_addr *)&rt->dest,
                         RT_SCOPE_UNIVERSE, (union inet_addr *)&rt->src);
    }

    /* get the address block of the selected source ip */
    ifa = inet_addr_ifa_get(AF_INET, rt->port, (union inet_addr *)&rt->src);
    if (!ifa) {
        route4_put(rt);
        return(EDPVS_NOTEXIST);
    }
    route4_put(rt);

    if (!ifa->sa_pool) {
        RTE_LOG(WARNING, SAPOOL, "%s: fetch addr on IP without pool.", __func__);
        inet_addr_ifa_put(ifa);
        return(EDPVS_INVAL);
    }

    /* do fetch socket address */
    err = sa_pool_fetch(sa_pool_hash(ifa->sa_pool, (struct sockaddr_storage *)daddr),
                        (struct sockaddr_storage *)saddr);
    if (err == EDPVS_OK) {
        rte_atomic32_inc(&ifa->sa_pool->refcnt);
    }

    inet_addr_ifa_put(ifa);
    return(err);
}
sa_pool_hash
It selects which sa_entry_pool bucket of the sa_pool to search, hashed by the destination's <ip, port>:
/* hash dest's <ip/port>. if no dest provided, just use first pool. */
static inline struct sa_entry_pool *sa_pool_hash(const struct sa_pool *ap,
                                                 const struct sockaddr_storage *ss)
{
    uint32_t hashkey;

    assert(ap && ap->pool_hash && ap->pool_hash_sz >= 1);
    if (!ss) {
        return(&ap->pool_hash[0]);
    }

    if (ss->ss_family == AF_INET) {
        uint16_t vect[2];
        const struct sockaddr_in *sin = (const struct sockaddr_in *)ss;

        vect[0] = ntohl(sin->sin_addr.s_addr) & 0xffff;
        vect[1] = ntohs(sin->sin_port);
        hashkey = (vect[0] + vect[1]) % ap->pool_hash_sz;

        return(&ap->pool_hash[hashkey]);
    } else if (ss->ss_family == AF_INET6) {
        uint32_t vect[5] = { 0 };
        const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)ss;

        vect[0] = sin6->sin6_port;
        memcpy(&vect[1], &sin6->sin6_addr, 16);
        hashkey = rte_jhash_32b(vect, 5, sin6->sin6_family) % ap->pool_hash_sz;

        return(&ap->pool_hash[hashkey]);
    } else {
        return(NULL);
    }
}
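For the IPv4 case, the sketch below (illustrative, not DPVS code) reproduces the bucket choice for an assumed destination (RS) 192.168.1.10:80 with the default 16 buckets: the low 16 bits of the destination IP plus the destination port, modulo the bucket count.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>   /* inet_addr, ntohl, htons, ntohs */

int main(void)
{
    uint32_t dip   = inet_addr("192.168.1.10");   /* network byte order (assumed RS) */
    uint16_t dport = htons(80);
    uint8_t  pool_hash_sz = 16;                   /* assumed default bucket count */

    uint16_t v0 = ntohl(dip) & 0xffff;            /* low 16 bits of dest IP */
    uint16_t v1 = ntohs(dport);
    uint32_t bucket = ((uint32_t)v0 + v1) % pool_hash_sz;

    printf("dest 192.168.1.10:80 -> pool_hash[%u]\n", bucket);
    return 0;
}

Since every bucket keeps its own free list over the same per-lcore port range, the structure suggests that the same <lip, lport> can be handed out again for a connection toward a different destination; the sessions still differ in their full 5-tuple because the destination differs.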
sa_pool_fetch
It takes the first free sa_entry from the free_enties list, fills in the local IP and port, moves the entry to the used list, and updates the statistics counters:
static inline int sa_pool_fetch(struct sa_entry_pool *pool,
                                struct sockaddr_storage *ss)
{
    assert(pool && ss);

    struct sa_entry     *ent;
    struct sockaddr_in  *sin  = (struct sockaddr_in *)ss;
    struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;

    /* take the first available sa_entry from the free list */
    ent = list_first_entry_or_null(&pool->free_enties, struct sa_entry, list);
    if (!ent) {
#ifdef CONFIG_DPVS_SAPOOL_DEBUG
        RTE_LOG(DEBUG, SAPOOL, "%s: no entry (used/free %d/%d)\n",
                __func__, pool->used_cnt, pool->free_cnt);
#endif
        pool->miss_cnt++;
        return(EDPVS_RESOURCE);
    }

    /* fill in the local ip and port */
    if (ss->ss_family == AF_INET) {
        sin->sin_family      = AF_INET;
        sin->sin_addr.s_addr = ent->addr.in.s_addr;
        sin->sin_port        = ent->port;
    } else if (ss->ss_family == AF_INET6) {
        sin6->sin6_family = AF_INET6;
        sin6->sin6_addr   = ent->addr.in6;
        sin6->sin6_port   = ent->port;
    } else {
        return(EDPVS_NOTSUPP);
    }

    /* mark the entry as in use */
    ent->flags |= SA_F_USED;
    /* move it to the used list */
    list_move_tail(&ent->list, &pool->used_enties);
    /* update the usage counters */
    pool->used_cnt++;
    pool->free_cnt--;

#ifdef CONFIG_DPVS_SAPOOL_DEBUG
    {
        char addr[64];
        RTE_LOG(DEBUG, SAPOOL, "%s: %s:%d fetched!\n", __func__,
                inet_ntop(ss->ss_family, &ent->addr, addr, sizeof(addr)) ? : NULL,
                ntohs(ent->port));
    }
#endif

    return(EDPVS_OK);
}
Original article: https://blog.csdn.net/zjx345438858/article/details/108124950