
TX performance of Intel 82546

To: Linux NICS <linux.nics@xxxxxxxxx>
Subject: TX performance of Intel 82546
From: Harald Welte <laforge@xxxxxxxxxxxxx>
Date: Wed, 15 Sep 2004 10:14:39 +0200
Cc: netdev@xxxxxxxxxxx
Sender: netdev-bounce@xxxxxxxxxxx
User-agent: Mutt/1.5.6+20040818i
Hi!

I'm currently trying to help Robert Olsson improve the performance of
the Linux in-kernel packet generator (pktgen.c).  At the moment, we seem
to be unable to get more than 760kpps from a single port of an 82546
(or any other PCI-X MAC supported by e1000) - that's a bit more than 51%
of wirespeed at 64-byte packet size.
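
(For reference: a minimum-size frame occupies 64 bytes + 8 bytes
preamble + 12 bytes inter-frame gap = 84 bytes on the wire, so GigE
wirespeed is 10^9 / (84 * 8) ~= 1.488Mpps, and 760kpps / 1.488Mpps
~= 51%.)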

I tried to find out whether this is a software problem (i.e. the Linux
network stack or pktgen), or a hardware or driver problem.

To do this, I hardwired some code into the e1000 driver that
automatically refills the Tx queue with the same packet over and over
again (see the ugly hack attached to this email).  When running in this
hardwired mode, I do not get any E1000_ICR_TXQE events - so apparently
the TX queue never runs empty, and the 82546 is transferring packets
from host memory as fast as possible.

However, I still don't get more than 760kpps from a single port.

Do you have any further recommendations or comments?

Is this 760kpps really a hardware limitation?  Is it limited by the
82546, by PCI-X latency/bandwidth, or by memory latency/bandwidth?
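
(Back-of-envelope: even counting a 16-byte descriptor fetch, a 64-byte
buffer read and a descriptor write-back per packet, 760kpps amounts to
roughly 70MB/s of bus traffic - far below what a 64bit/66MHz bus can
move - so raw bus bandwidth alone should not be the limit, which makes
me suspect per-transaction latency.)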

Did Intel ever achieve higher TX pps rates with the 82546 MAC?  If yes,
on which hardware and OS?

Thanks for your help.


Hardware:

MSI K8D Master-F, Dual Opteron 1.4GHz, 1GB RAM, PC-2700 (DDR-333), all on CPU1

0000:02:03.0 Ethernet controller: Intel Corp. 82546GB Gigabit Ethernet Controller (rev 03)
        Subsystem: Intel Corp. PRO/1000 MT Dual Port Network Connection
        Flags: bus master, 66MHz, medium devsel, latency 64, IRQ 24
        Memory at fc9c0000 (64-bit, non-prefetchable) [size=128K]
        Memory at fc900000 (64-bit, non-prefetchable) [size=256K]
        I/O ports at a880 [size=64]
        Expansion ROM at fc8c0000 [disabled] [size=256K]
        Capabilities: [dc] Power Management version 2
        Capabilities: [e4]
        Capabilities: [f0] Message Signalled Interrupts: 64bit+ Queue=0/0 Enable-


Software:

linux-2.6.8.1
- modified e1000 to keep re-filling 2048 tx descriptors with the same skb
- board does not generate any 'tx queue empty' interrupts
        - thus, TX is running at full PC/HT/memory speed

TxDescriptors=2048,2048 FlowControl=0,0 Speed=1000,1000 TxIntDelay=0,0 TxAbsIntDelay=0,0 InterruptThrottleRate=0,0

No more than 763kpps possible :(

tried (no improvement):
        - IntDelay, AbsIntDelay, dynamic ThrottleRate
        - enabling NAPI (disables TX interrupt)
                -> tx queue runs empty
        - smp_affinity to other cpu (not sure which has local ram)
                -> only 714kpps
        - increasing skb size to 128 bytes
                -> 746kpps

---

diff -Nru linux-2.6.8.1/drivers/net/e1000/e1000.h linux-2.6.8.1-test/drivers/net/e1000/e1000.h
--- linux-2.6.8.1/drivers/net/e1000/e1000.h     2004-08-14 10:54:47.000000000 +0000
+++ linux-2.6.8.1-test/drivers/net/e1000/e1000.h        2004-09-13 16:37:12.000000000 +0000
@@ -202,6 +202,7 @@
        spinlock_t stats_lock;
        atomic_t irq_sem;
        struct work_struct tx_timeout_task;
+       struct work_struct tx_pktgen_task;
        uint8_t fc_autoneg;
 
        struct timer_list blink_timer;
diff -Nru linux-2.6.8.1/drivers/net/e1000/e1000_main.c linux-2.6.8.1-test/drivers/net/e1000/e1000_main.c
--- linux-2.6.8.1/drivers/net/e1000/e1000_main.c        2004-08-14 10:55:10.000000000 +0000
+++ linux-2.6.8.1-test/drivers/net/e1000/e1000_main.c   2004-09-13 21:19:13.996635320 +0000
@@ -111,6 +111,9 @@
 void e1000_update_stats(struct e1000_adapter *adapter);
 
 /* Local Function Prototypes */
+static struct sk_buff *test_dummy_skb(unsigned int size);
+static void test_refill_tx_queue(struct e1000_adapter *adapter);
+static void test_tx_pktgen_task(struct net_device *netdev);
 
 static int e1000_init_module(void);
 static void e1000_exit_module(void);
@@ -273,6 +276,9 @@
        mod_timer(&adapter->watchdog_timer, jiffies);
        e1000_irq_enable(adapter);
 
+       test_dummy_skb(60);
+       test_refill_tx_queue(adapter);
+
        return 0;
 }
 
@@ -281,6 +287,8 @@
 {
        struct net_device *netdev = adapter->netdev;
 
+       printk("%s: entering\n", __FUNCTION__);
+
        e1000_irq_disable(adapter);
        free_irq(adapter->pdev->irq, netdev);
        del_timer_sync(&adapter->tx_fifo_stall_timer);
@@ -533,6 +541,9 @@
        INIT_WORK(&adapter->tx_timeout_task,
                (void (*)(void *))e1000_tx_timeout_task, netdev);
 
+       INIT_WORK(&adapter->tx_pktgen_task,
+               (void (*)(void *))test_tx_pktgen_task, netdev);
+
        /* we're going to reset, so assume we have no link for now */
 
        netif_carrier_off(netdev);
@@ -765,6 +776,7 @@
 {
        struct e1000_adapter *adapter = netdev->priv;
 
+       printk("%s: entering\n", __FUNCTION__);
        e1000_down(adapter);
 
        e1000_free_tx_resources(adapter);
@@ -1070,13 +1082,15 @@
 
        for(i = 0; i < tx_ring->count; i++) {
                buffer_info = &tx_ring->buffer_info[i];
-               if(buffer_info->skb) {
-
+               if(buffer_info->dma) {
                        pci_unmap_page(pdev,
                                       buffer_info->dma,
                                       buffer_info->length,
                                       PCI_DMA_TODEVICE);
+                       buffer_info->dma = 0;
+               }
 
+               if(buffer_info->skb) {
                        dev_kfree_skb(buffer_info->skb);
 
                        buffer_info->skb = NULL;
@@ -1434,6 +1448,7 @@
                         * but we've got queued Tx work that's never going
                         * to get done, so reset controller to flush Tx.
                         * (Do the reset outside of interrupt context). */
+                       printk("%s: scheduling timeout\n", __FUNCTION__);
                        schedule_work(&adapter->tx_timeout_task);
                }
        }
@@ -1555,6 +1570,7 @@
 #define E1000_MAX_TXD_PWR      12
 #define E1000_MAX_DATA_PER_TXD (1<<E1000_MAX_TXD_PWR)
 
+
 static inline int
 e1000_tx_map(struct e1000_adapter *adapter, struct sk_buff *skb,
        unsigned int first, unsigned int max_per_txd,
@@ -1754,6 +1770,13 @@
                return 0;
        }
 
+#if 0
+       /* don't send any packets */
+       dev_kfree_skb_any(skb);
+       netdev->trans_start = jiffies;
+       return 0;
+#endif
+
 #ifdef NETIF_F_TSO
        mss = skb_shinfo(skb)->tso_size;
        /* The controller does a simple calculation to 
@@ -1791,6 +1814,8 @@
        if(E1000_DESC_UNUSED(&adapter->tx_ring) < count + 2 ) {
                netif_stop_queue(netdev);
                spin_unlock_irqrestore(&adapter->tx_lock, flags);
+               if (net_ratelimit())
+                       printk(KERN_DEBUG "err: no unused descriptors\n");
                return 1;
        }
        spin_unlock_irqrestore(&adapter->tx_lock, flags);
@@ -1834,6 +1859,7 @@
 {
        struct e1000_adapter *adapter = netdev->priv;
 
+       printk("%s: entering\n", __FUNCTION__);
        /* Do the reset outside of interrupt context */
        schedule_work(&adapter->tx_timeout_task);
 }
@@ -1843,6 +1869,7 @@
 {
        struct e1000_adapter *adapter = netdev->priv;
 
+       printk("%s: entering\n", __FUNCTION__);
        netif_device_detach(netdev);
        e1000_down(adapter);
        e1000_up(adapter);
@@ -2078,6 +2105,8 @@
 {
        if(atomic_dec_and_test(&adapter->irq_sem)) {
                E1000_WRITE_REG(&adapter->hw, IMS, IMS_ENABLE_MASK);
+               /* disable RX interrupt generation */
+               //E1000_WRITE_REG(&adapter->hw, IMS, IMS_ENABLE_MASK & ~E1000_IMS_RXDMT0);
                E1000_WRITE_FLUSH(&adapter->hw);
        }
 }
@@ -2103,11 +2132,27 @@
        if(!icr)
                return IRQ_NONE;  /* Not our interrupt */
 
+#if 0
+       printk("e1000_intr: icr=0x%08x\n", icr);
+       printk("%s: tdh=%d, tdt=%d\n", __FUNCTION__,
+               E1000_READ_REG(&adapter->hw, TDH),
+               E1000_READ_REG(&adapter->hw, TDT));
+#endif
+
        if(icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
                hw->get_link_status = 1;
                mod_timer(&adapter->watchdog_timer, jiffies);
        }
 
+       if (icr & E1000_ICR_TXQE) {
+               printk("TX queue empty: shouldn't happen!\n");
+       }
+
+       if (icr & E1000_ICR_TXDW) {
+               e1000_clean_tx_irq(adapter);
+               schedule_work(&adapter->tx_pktgen_task);
+       }
+
 #ifdef CONFIG_E1000_NAPI
        if(netif_rx_schedule_prep(netdev)) {
 
@@ -2116,7 +2161,7 @@
                */
 
                atomic_inc(&adapter->irq_sem);
-               E1000_WRITE_REG(hw, IMC, ~0);
+//             E1000_WRITE_REG(hw, IMC, ~0);
                __netif_rx_schedule(netdev);
        }
 #else
@@ -2142,6 +2187,7 @@
        int work_to_do = min(*budget, netdev->quota);
        int work_done = 0;
        
+       //printk("%s entered\n", __FUNCTION__);
        e1000_clean_tx_irq(adapter);
        e1000_clean_rx_irq(adapter, &work_done, work_to_do);
 
@@ -2175,6 +2221,7 @@
        boolean_t cleaned = FALSE;
 
 
+       //printk("%s entered\n", __FUNCTION__);
        i = tx_ring->next_to_clean;
        eop = tx_ring->buffer_info[i].next_to_watch;
        eop_desc = E1000_TX_DESC(*tx_ring, eop);
@@ -2184,6 +2231,7 @@
                for(cleaned = FALSE; !cleaned; ) {
                        tx_desc = E1000_TX_DESC(*tx_ring, i);
                        buffer_info = &tx_ring->buffer_info[i];
+                       //printk("cleaning tx_desc %d\n", i);
 
                        if(buffer_info->dma) {
 
@@ -2802,6 +2850,7 @@
        uint32_t ctrl, ctrl_ext, rctl, manc, status;
        uint32_t wufc = adapter->wol;
 
+       printk("%s: entering\n", __FUNCTION__);
        netif_device_detach(netdev);
 
        if(netif_running(netdev))
@@ -2920,4 +2969,169 @@
 }
 #endif
 
+
+#include <linux/ip.h>
+#include <linux/udp.h>
+static struct sk_buff *test_skb;
+
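+/* Build a dummy Ethernet/IPv4/UDP test frame: only the headers are
+ * filled in (the length fields claim pkt_size bytes), and skb_get()
+ * bumps the refcount so Tx cleanup never actually frees the skb. */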
+static struct sk_buff *
+test_dummy_skb(unsigned int pkt_size)
+{
+       int datalen;
+       struct sk_buff *skb;
+       __u8 *eth;
+       struct iphdr *iph;
+       struct udphdr *udph;
+       
+       skb = alloc_skb(pkt_size + 64 + 16, GFP_ATOMIC);
+
+       if (!skb)
+               return NULL;
+
+       /* increase reference count so no one can free it */
+       skb_get(skb);
+
+       skb_reserve(skb, 16);
+       eth = (__u8 *) skb_push(skb, 14);
+       iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr));
+       udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
+       memset(eth, 1, 6);      // dst
+       memset(eth+6, 2, 6);    // src
+       memset(eth+12, 0x08, 1);
+       memset(eth+13, 0x00, 1);
+
+       datalen = pkt_size - 14 - 20 - 8;
+
+       udph->source = htons(9);
+       udph->dest = htons(9);
+       udph->len = htons(datalen + 8);
+       udph->check = 0;
+
+       iph->ihl = 5;
+       iph->version = 4;
+       iph->ttl = 3;
+       iph->tos = 0;
+       iph->protocol = IPPROTO_UDP;
+       iph->saddr = htonl(0x01010101);
+       iph->daddr = htonl(0x02020202);
+       iph->frag_off = 0;
+       iph->tot_len = htons(20+8+datalen);
+       iph->check = 0;
+       iph->check = ip_fast_csum((void *) iph, iph->ihl);
+       skb->protocol = __constant_htons(ETH_P_IP);
+       skb->mac.raw = ((u8 *)iph) - 14;
+       skb->pkt_type = PACKET_HOST;
+
+       test_skb = skb;
+
+       return skb;
+}
+
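+/* Stripped-down version of e1000_tx_queue(): (re)arm `count' descriptors
+ * pointing at the already-mapped buffer_info DMA addresses; the caller
+ * is responsible for advancing TDT afterwards. */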
+static inline void
+test_tx_queue(struct e1000_adapter *adapter, int count, int tx_flags)
+{
+       struct e1000_desc_ring *tx_ring = &adapter->tx_ring;
+       struct e1000_tx_desc *tx_desc = NULL;
+       struct e1000_buffer *buffer_info;
+       uint32_t txd_upper = 0, txd_lower = E1000_TXD_CMD_IFCS;
+       unsigned int i;
+
+       if(tx_flags & E1000_TX_FLAGS_TSO) {
+               txd_lower |= E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D |
+                            E1000_TXD_CMD_TSE;
+               txd_upper |= (E1000_TXD_POPTS_IXSM | E1000_TXD_POPTS_TXSM) << 8;
+       }
+
+       if(tx_flags & E1000_TX_FLAGS_CSUM) {
+               txd_lower |= E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
+               txd_upper |= E1000_TXD_POPTS_TXSM << 8;
+       }
+
+       if(tx_flags & E1000_TX_FLAGS_VLAN) {
+               txd_lower |= E1000_TXD_CMD_VLE;
+               txd_upper |= (tx_flags & E1000_TX_FLAGS_VLAN_MASK);
+       }
+
+       i = tx_ring->next_to_use;
+
+       while(count--) {
+               buffer_info = &tx_ring->buffer_info[i];
+               tx_desc = E1000_TX_DESC(*tx_ring, i);
+               tx_desc->buffer_addr = cpu_to_le64(buffer_info->dma);
+               tx_desc->lower.data =
+                       cpu_to_le32(txd_lower | buffer_info->length);
+               tx_desc->upper.data = cpu_to_le32(txd_upper);
+               if(++i == tx_ring->count) i = 0;
+       }
+
+       tx_desc->lower.data |= cpu_to_le32(adapter->txd_cmd);
+
+       /* Force memory writes to complete before letting h/w
+        * know there are new descriptors to fetch.  (Only
+        * applicable for weak-ordered memory model archs,
+        * such as IA-64). */
+       wmb();
+
+       tx_ring->next_to_use = i;
+}
+
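+/* Fill every unused Tx descriptor (minus a small reserve) with the same
+ * test_skb and bump TDT, so the MAC always has descriptors to fetch. */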
+static void
+test_refill_tx_queue(struct e1000_adapter *adapter)
+{
+       int i, num;
+       unsigned long flags;
+       int reserve = 2 + 10; 
+       
+       spin_lock_irqsave(&adapter->tx_lock, flags);
+       num = E1000_DESC_UNUSED(&adapter->tx_ring);
+
+       if (num <= reserve) {
+               printk("too few unused descriptors to refill\n");
+               spin_unlock_irqrestore(&adapter->tx_lock, flags);
+               return;
+       }
+
+       //printk("%s: refilling %d descriptors\n", __FUNCTION__, num - reserve);
+
+       i = 0;
+       while (1) {
+               int ret, skb_idx;
+               if (i >= num-reserve)
+                       break;
+
+       //      printk("e1000_tx_map(%d)\n", adapter->tx_ring.next_to_use);
+               ret = e1000_tx_map(adapter, test_skb, 
+                                       adapter->tx_ring.next_to_use,
+                                      E1000_MAX_DATA_PER_TXD,
+                                      skb_shinfo(test_skb)->nr_frags,
+                                      skb_shinfo(test_skb)->tso_size);
+               skb_idx = adapter->tx_ring.buffer_info[adapter->tx_ring.next_to_use].next_to_watch;
+               adapter->tx_ring.buffer_info[skb_idx].skb = NULL;
+               test_tx_queue(adapter, ret, 0);
+               i += ret;
+       }
+
+#if 0
+       printk("%s: tdh=%d, tdt=%d\n", __FUNCTION__,
+               E1000_READ_REG(&adapter->hw, TDH),
+               E1000_READ_REG(&adapter->hw, TDT));
+#endif
+       E1000_WRITE_REG(&adapter->hw, TDT, adapter->tx_ring.next_to_use);
+#if 0
+       printk("%s: tdh=%d, tdt=%d\n", __FUNCTION__,
+               E1000_READ_REG(&adapter->hw, TDH),
+               E1000_READ_REG(&adapter->hw, TDT));
+#endif
+
+       spin_unlock_irqrestore(&adapter->tx_lock, flags);
+}
+
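+/* Work handler scheduled from e1000_intr() on TXDW: re-fill the ring
+ * outside of hard interrupt context. */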
+static void
+test_tx_pktgen_task(struct net_device *netdev)
+{
+       struct e1000_adapter *adapter = netdev->priv;
+       test_refill_tx_queue(adapter);
+}
+
+
 /* e1000_main.c */



-- 
- Harald Welte <laforge@xxxxxxxxxxxxx>             http://www.netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie
