On 11/01/2017 05:36 PM, David Daney wrote:
From: Carlos Munoz <cmunoz@xxxxxxxxxx>
The Cavium OCTEON cn78xx and cn73xx SoCs have network packet I/O
hardware that is significantly different from previous generations of
the family.
Add a new driver for this hardware. The Ethernet MAC is called BGX on
these devices. Common code for the MAC is in octeon3-bgx-port.c.
Four of these BGX MACs are grouped together and managed as a group by
octeon3-bgx-nexus.c. Ingress packet classification is done by the PKI
unit initialized in octeon3-pki.c. Queue management is done in the
SSO, initialized by octeon3-sso.c. Egress is handled by the PKO,
initialized in octeon3-pko.c.
Signed-off-by: Carlos Munoz <cmunoz@xxxxxxxxxx>
Signed-off-by: Steven J. Hill <Steven.Hill@xxxxxxxxxx>
Signed-off-by: David Daney <david.daney@xxxxxxxxxx>
---
+static char *mix_port;
+module_param(mix_port, charp, 0444);
+MODULE_PARM_DESC(mix_port, "Specifies which ports connect to MIX interfaces.");
Can you derive this from Device Tree /platform data configuration?
+
+static char *pki_port;
+module_param(pki_port, charp, 0444);
+MODULE_PARM_DESC(pki_port, "Specifies which ports connect to the PKI.");
Likewise
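Something like the sketch below would avoid both module parameters.
This is only an illustration: the "cavium,mix-ports" property name and
the mark_port_as_mix() helper are made up here, the point is just that
this routing is board configuration and belongs in the device tree /
platform data:

	u32 ports[MAX_MIX];
	int n, i;

	/* Hypothetical property listing which ports are wired to MIX. */
	n = of_property_count_u32_elems(pdev->dev.of_node,
					"cavium,mix-ports");
	if (n > 0) {
		n = min_t(int, n, MAX_MIX);
		of_property_read_u32_array(pdev->dev.of_node,
					   "cavium,mix-ports", ports, n);
		for (i = 0; i < n; i++)
			mark_port_as_mix(ports[i]);	/* made-up helper */
	}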
+
+#define MAX_MIX_PER_NODE 2
+
+#define MAX_MIX (MAX_NODES * MAX_MIX_PER_NODE)
+
+/**
+ * struct mix_port_lmac - Describes a lmac that connects to a mix
+ * port. The lmac must be on the same node as
+ * the mix.
+ * @node: Node of the lmac.
+ * @bgx: Bgx of the lmac.
+ * @lmac: Lmac index.
+ */
+struct mix_port_lmac {
+ int node;
+ int bgx;
+ int lmac;
+};
+
+/* mix_ports_lmacs contains all the lmacs connected to mix ports */
+static struct mix_port_lmac mix_port_lmacs[MAX_MIX];
+
+/* pki_ports keeps track of the lmacs connected to the pki */
+static bool pki_ports[MAX_NODES][MAX_BGX_PER_NODE][MAX_LMAC_PER_BGX];
+
+/* Created platform devices get added to this list */
+static struct list_head pdev_list;
+static struct mutex pdev_list_lock;
+
+/* Created platform device use this structure to add themselves to the list */
+struct pdev_list_item {
+ struct list_head list;
+ struct platform_device *pdev;
+};
Don't you have a top-level platform device that you could use to hold
this data instead of keeping it in file-scope globals here?
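Something along these lines, i.e. hang the lists and tables off the
nexus device's driver data; bgx_nexus_data and bgx_nexus_probe() are
invented names, and the per-node dimension drops out because the nexus
is already per node:

struct bgx_nexus_data {
	struct list_head pdev_list;
	struct mutex pdev_list_lock;
	struct mix_port_lmac mix_port_lmacs[MAX_MIX_PER_NODE];
	bool pki_ports[MAX_BGX_PER_NODE][MAX_LMAC_PER_BGX];
};

static int bgx_nexus_probe(struct platform_device *pdev)
{
	struct bgx_nexus_data *nd;

	nd = devm_kzalloc(&pdev->dev, sizeof(*nd), GFP_KERNEL);
	if (!nd)
		return -ENOMEM;

	INIT_LIST_HEAD(&nd->pdev_list);
	mutex_init(&nd->pdev_list_lock);
	platform_set_drvdata(pdev, nd);
	return 0;
}

The child devices can then reach it with dev_get_drvdata(pdev->dev.parent).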
[snip]
+/* Registers are accessed via xkphys */
+#define SSO_BASE 0x1670000000000ull
+#define SSO_ADDR(node) (SET_XKPHYS + NODE_OFFSET(node) + \
+ SSO_BASE)
+#define GRP_OFFSET(grp) ((grp) << 16)
+#define GRP_ADDR(n, g) (SSO_ADDR(n) + GRP_OFFSET(g))
+#define SSO_GRP_AQ_CNT(n, g) (GRP_ADDR(n, g) + 0x20000700)
+
+#define MIO_PTP_BASE 0x1070000000000ull
+#define MIO_PTP_ADDR(node) (SET_XKPHYS + NODE_OFFSET(node) + \
+ MIO_PTP_BASE)
+#define MIO_PTP_CLOCK_CFG(node) (MIO_PTP_ADDR(node) + 0xf00)
+#define MIO_PTP_CLOCK_HI(node) (MIO_PTP_ADDR(node) + 0xf10)
+#define MIO_PTP_CLOCK_COMP(node) (MIO_PTP_ADDR(node) + 0xf18)
I am sure this will work great on anything but MIPS64 ;)
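For what it's worth, mapping the register windows with ioremap() and
going through readq()/writeq() would at least keep the accessors
portable. Sketch only; SSO_WINDOW_SIZE is a placeholder for whatever the
window size really is:

	void __iomem *sso_base;

	/* Map the per-node SSO window once at probe time. */
	sso_base = ioremap(SSO_BASE + NODE_OFFSET(node), SSO_WINDOW_SIZE);
	if (!sso_base)
		return -ENOMEM;

	aq_cnt = readq(sso_base + GRP_OFFSET(grp) + 0x20000700);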
+
+struct octeon3_ethernet;
+
+struct octeon3_rx {
+ struct napi_struct napi;
+ struct octeon3_ethernet *parent;
+ int rx_grp;
+ int rx_irq;
+ cpumask_t rx_affinity_hint;
+} ____cacheline_aligned_in_smp;
+
+struct octeon3_ethernet {
+ struct bgx_port_netdev_priv bgx_priv; /* Must be first element. */
+ struct list_head list;
+ struct net_device *netdev;
+ enum octeon3_mac_type mac_type;
+ struct octeon3_rx rx_cxt[MAX_RX_QUEUES];
+ struct ptp_clock_info ptp_info;
+ struct ptp_clock *ptp_clock;
+ struct cyclecounter cc;
+ struct timecounter tc;
+ spinlock_t ptp_lock; /* Serialize ptp clock adjustments */
+ int num_rx_cxt;
+ int pki_aura;
+ int pknd;
+ int pko_queue;
+ int node;
+ int interface;
+ int index;
+ int rx_buf_count;
+ int tx_complete_grp;
+ int rx_timestamp_hw:1;
+ int tx_timestamp_hw:1;
+ spinlock_t stat_lock; /* Protects stats counters */
+ u64 last_packets;
+ u64 last_octets;
+ u64 last_dropped;
+ atomic64_t rx_packets;
+ atomic64_t rx_octets;
+ atomic64_t rx_dropped;
+ atomic64_t rx_errors;
+ atomic64_t rx_length_errors;
+ atomic64_t rx_crc_errors;
+ atomic64_t tx_packets;
+ atomic64_t tx_octets;
+ atomic64_t tx_dropped;
Do you really need those to be truly atomic64_t types? Can't you use
u64 along with the helpers from u64_stats_sync.h? That should be good
enough; a rough sketch is after this struct.
+ /* The following two fields need to be on a different cache line as
+ * they are updated by pko which invalidates the cache every time it
+ * updates them. The idea is to prevent other fields from being
+ * invalidated unnecessarily.
+ */
+ char cacheline_pad1[CVMX_CACHE_LINE_SIZE];
+ atomic64_t buffers_needed;
+ atomic64_t tx_backlog;
+ char cacheline_pad2[CVMX_CACHE_LINE_SIZE];
+};
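Rough sketch of the u64_stats_sync idea from my comment on the
atomic64_t counters above (per-port counter struct, names shortened;
the u64_stats_init() call on syncp at probe time is omitted):

	struct octeon3_stats64 {
		u64 packets;
		u64 octets;
		u64 dropped;
		struct u64_stats_sync syncp;
	};

	/* writer, e.g. in the completion path */
	u64_stats_update_begin(&stats->syncp);
	stats->packets++;
	stats->octets += skb->len;
	u64_stats_update_end(&stats->syncp);

	/* reader, e.g. in ndo_get_stats64 */
	unsigned int start;

	do {
		start = u64_stats_fetch_begin_irq(&stats->syncp);
		packets = stats->packets;
		octets = stats->octets;
	} while (u64_stats_fetch_retry_irq(&stats->syncp, start));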
+
+static DEFINE_MUTEX(octeon3_eth_init_mutex);
+
+struct octeon3_ethernet_node;
+
+struct octeon3_ethernet_worker {
+ wait_queue_head_t queue;
+ struct task_struct *task;
+ struct octeon3_ethernet_node *oen;
+ atomic_t kick;
+ int order;
+};
+
+struct octeon3_ethernet_node {
+ bool init_done;
+ int next_cpu_irq_affinity;
+ int node;
+ int pki_packet_pool;
+ int sso_pool;
+ int pko_pool;
+ void *sso_pool_stack;
+ void *pko_pool_stack;
+ void *pki_packet_pool_stack;
+ int sso_aura;
+ int pko_aura;
+ int tx_complete_grp;
+ int tx_irq;
+ cpumask_t tx_affinity_hint;
+ struct octeon3_ethernet_worker workers[8];
+ struct mutex device_list_lock; /* Protects the device list */
+ struct list_head device_list;
+ spinlock_t napi_alloc_lock; /* Protects napi allocations */
+};
+
+static int wait_pko_response;
+module_param(wait_pko_response, int, 0644);
+MODULE_PARM_DESC(wait_pko_response, "Wait for response after each pko command.");
Under which circumstances is this used?
+
+static int num_packet_buffers = 768;
+module_param(num_packet_buffers, int, 0444);
+MODULE_PARM_DESC(num_packet_buffers,
+ "Number of packet buffers to allocate per port.");
Consider implementing ethtool -g/G for this.
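i.e. wire up .get_ringparam/.set_ringparam so this can be changed per
netdev at run time. Minimal sketch; the 32/4096 limits are arbitrary
placeholders and the aura resize itself is glossed over:

static void octeon3_get_ringparam(struct net_device *netdev,
				  struct ethtool_ringparam *ering)
{
	struct octeon3_ethernet *priv = netdev_priv(netdev);

	ering->rx_max_pending = 4096;
	ering->rx_pending = priv->rx_buf_count;
}

static int octeon3_set_ringparam(struct net_device *netdev,
				 struct ethtool_ringparam *ering)
{
	struct octeon3_ethernet *priv = netdev_priv(netdev);

	if (ering->rx_pending < 32 || ering->rx_pending > 4096)
		return -EINVAL;

	/* Grow or shrink the aura to the new count here. */
	priv->rx_buf_count = ering->rx_pending;
	return 0;
}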
+
+static int packet_buffer_size = 2048;
+module_param(packet_buffer_size, int, 0444);
+MODULE_PARM_DESC(packet_buffer_size, "Size of each RX packet buffer.");
How is that different from adjusting the network device's MTU?
+
+static int rx_queues = 1;
+module_param(rx_queues, int, 0444);
+MODULE_PARM_DESC(rx_queues, "Number of RX threads per port.");
Same thing, can you consider using an ethtool knob for that?
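This one maps naturally onto ethtool -l/-L. A minimal sketch, reusing
the existing MAX_RX_QUEUES and num_rx_cxt (re-creating the per-queue
NAPI contexts on a change is glossed over):

static void octeon3_get_channels(struct net_device *netdev,
				 struct ethtool_channels *ch)
{
	struct octeon3_ethernet *priv = netdev_priv(netdev);

	ch->max_rx = MAX_RX_QUEUES;
	ch->rx_count = priv->num_rx_cxt;
}

static int octeon3_set_channels(struct net_device *netdev,
				struct ethtool_channels *ch)
{
	struct octeon3_ethernet *priv = netdev_priv(netdev);

	if (!ch->rx_count || ch->rx_count > MAX_RX_QUEUES)
		return -EINVAL;

	priv->num_rx_cxt = ch->rx_count;
	return 0;
}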
+
+int ilk0_lanes = 1;
+module_param(ilk0_lanes, int, 0444);
+MODULE_PARM_DESC(ilk0_lanes, "Number of SerDes lanes used by ILK link 0.");
+
+int ilk1_lanes = 1;
+module_param(ilk1_lanes, int, 0444);
+MODULE_PARM_DESC(ilk1_lanes, "Number of SerDes lanes used by ILK link 1.");
+
+static struct octeon3_ethernet_node octeon3_eth_node[MAX_NODES];
+static struct kmem_cache *octeon3_eth_sso_pko_cache;
+
+/**
+ * Reads a 64 bit value from the processor local scratchpad memory.
+ *
+ * @param offset byte offset into scratch pad to read
+ *
+ * @return value read
+ */
+static inline u64 scratch_read64(u64 offset)
+{
+ return *(u64 *)((long)SCRATCH_BASE + offset);
+}
No barriers needed whatsoever?
+
+/**
+ * Write a 64 bit value to the processor local scratchpad memory.
+ *
+ * @param offset byte offset into scratch pad to write
+ * @param value value to write
+ */
+static inline void scratch_write64(u64 offset, u64 value)
+{
+ *(u64 *)((long)SCRATCH_BASE + offset) = value;
+}
+
+static int get_pki_chan(int node, int interface, int index)
+{
+ int pki_chan;
+
+ pki_chan = node << 12;
+
+ if (OCTEON_IS_MODEL(OCTEON_CNF75XX) &&
+ (interface == 1 || interface == 2)) {
+ /* SRIO */
+ pki_chan |= 0x240 + (2 * (interface - 1)) + index;
+ } else {
+ /* BGX */
+ pki_chan |= 0x800 + (0x100 * interface) + (0x10 * index);
+ }
+
+ return pki_chan;
+}
+
+/* Map auras to the field priv->buffers_needed. Used to speed up packet
+ * transmission.
+ */
+static void *aura2bufs_needed[MAX_NODES][FPA3_NUM_AURAS];
+
+static int octeon3_eth_lgrp_to_ggrp(int node, int grp)
+{
+ return (node << 8) | grp;
+}
+
+static void octeon3_eth_gen_affinity(int node, cpumask_t *mask)
+{
+ int cpu;
+
+ do {
+ cpu = cpumask_next(octeon3_eth_node[node].next_cpu_irq_affinity, cpu_online_mask);
+ octeon3_eth_node[node].next_cpu_irq_affinity++;
+ if (cpu >= nr_cpu_ids) {
+ octeon3_eth_node[node].next_cpu_irq_affinity = -1;
+ continue;
+ }
+ } while (false);
+ cpumask_clear(mask);
+ cpumask_set_cpu(cpu, mask);
+}
+
+struct wr_ret {
+ void *work;
+ u16 grp;
+};
+
+static inline struct wr_ret octeon3_core_get_work_sync(int grp)
+{
+ u64 node = cvmx_get_node_num();
+ u64 addr;
+ u64 response;
+ struct wr_ret r;
+
+ /* See SSO_GET_WORK_LD_S for the address to read */
+ addr = 1ull << 63;
+ addr |= BIT(48);
+ addr |= DID_TAG_SWTAG << 40;
+ addr |= node << 36;
+ addr |= BIT(30);
+ addr |= BIT(29);
+ addr |= octeon3_eth_lgrp_to_ggrp(node, grp) << 4;
+ addr |= SSO_NO_WAIT << 3;
+ response = __raw_readq((void __iomem *)addr);
+
+ /* See SSO_GET_WORK_RTN_S for the format of the response */
+ r.grp = (response & GENMASK_ULL(57, 48)) >> 48;
+ if (response & BIT(63))
+ r.work = NULL;
+ else
+ r.work = phys_to_virt(response & GENMASK_ULL(41, 0));
There are quite a lot of phys_to_virt() and virt_to_phys() uses
throughout this driver. This will likely not work on anything but
MIPS64, so there should be a better, abstracted way to do this; see
below.
+
+ return r;
+}
+
+/**
+ * octeon3_core_get_work_async - Request work via an iobdma command. Doesn't wait
+ * for the response.
+ *
+ * @grp: Group to request work for.
+ */
+static inline void octeon3_core_get_work_async(unsigned int grp)
+{
+ u64 data;
+ u64 node = cvmx_get_node_num();
+
+ /* See SSO_GET_WORK_DMA_S for the command structure */
+ data = SCR_SCRATCH << 56;
+ data |= 1ull << 48;
+ data |= DID_TAG_SWTAG << 40;
+ data |= node << 36;
+ data |= 1ull << 30;
+ data |= 1ull << 29;
+ data |= octeon3_eth_lgrp_to_ggrp(node, grp) << 4;
+ data |= SSO_NO_WAIT << 3;
+
+ __raw_writeq(data, (void __iomem *)IOBDMA_SENDSINGLE);
+}
+
+/**
+ * octeon3_core_get_response_async - Read the request work response. Must be
+ * called after calling
+ * octeon3_core_get_work_async().
+ *
+ * Returns work queue entry.
+ */
+static inline struct wr_ret octeon3_core_get_response_async(void)
+{
+ struct wr_ret r;
+ u64 response;
+
+ CVMX_SYNCIOBDMA;
+ response = scratch_read64(SCR_SCRATCH);
+
+ /* See SSO_GET_WORK_RTN_S for the format of the response */
+ r.grp = (response & GENMASK_ULL(57, 48)) >> 48;
+ if (response & BIT(63))
+ r.work = NULL;
+ else
+ r.work = phys_to_virt(response & GENMASK_ULL(41, 0));
+
+ return r;
+}
+
+static void octeon3_eth_replenish_rx(struct octeon3_ethernet *priv, int count)
+{
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ void **buf;
+
+ skb = __alloc_skb(packet_buffer_size, GFP_ATOMIC, 0, priv->node);
+ if (!skb)
+ break;
+ buf = (void **)PTR_ALIGN(skb->head, 128);
+ buf[SKB_PTR_OFFSET] = skb;
Can you use build_skb()? A rough sketch is after this hunk.
+ octeon_fpa3_free(priv->node, priv->pki_aura, buf);
+ }
+}
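Here is roughly what I have in mind with build_skb(): fill the aura
with bare buffers and only create the skb when a packet actually
arrives. Sketch only; it assumes the aura is happy with any suitably
aligned buffer, that the size handed to the PKI leaves room for struct
skb_shared_info at the end, and data_offset/packet_len stand in for
whatever the work queue entry reports:

	/* Refill: hand raw buffers to the FPA instead of full skbs. */
	for (i = 0; i < count; i++) {
		void *buf = napi_alloc_frag(packet_buffer_size);

		if (!buf)
			break;
		octeon_fpa3_free(priv->node, priv->pki_aura, buf);
	}

	/* Receive: wrap the filled buffer without copying. */
	skb = build_skb(buf, packet_buffer_size);
	if (skb) {
		skb_reserve(skb, data_offset);	/* where the PKI wrote */
		skb_put(skb, packet_len);
	}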
+
[snip]
+static int octeon3_eth_tx_complete_worker(void *data)
+{
+ struct octeon3_ethernet_worker *worker = data;
+ struct octeon3_ethernet_node *oen = worker->oen;
+ int backlog;
+ int order = worker->order;
+ int tx_complete_stop_thresh = order * 100;
+ int backlog_stop_thresh = order == 0 ? 31 : order * 80;
+ u64 aq_cnt;
+ int i;
+
+ while (!kthread_should_stop()) {
+ wait_event_interruptible(worker->queue, octeon3_eth_tx_complete_runnable(worker));
+ atomic_dec_if_positive(&worker->kick); /* clear the flag */
+
+ do {
+ backlog = octeon3_eth_replenish_all(oen);
+ for (i = 0; i < 100; i++) {
Do you really want to bound your TX reclamation worker? Other network
drivers do not bound their NAPI TX completion task and instead reclaim
every transmitted buffer.
+ void **work;
+ struct net_device *tx_netdev;
+ struct octeon3_ethernet *tx_priv;
+ struct sk_buff *skb;
+ struct wr_ret r;
+
+ r = octeon3_core_get_work_sync(oen->tx_complete_grp);
+ work = r.work;
+ if (!work)
+ break;
+ tx_netdev = work[0];
+ tx_priv = netdev_priv(tx_netdev);
+ if (unlikely(netif_queue_stopped(tx_netdev)) &&
+ atomic64_read(&tx_priv->tx_backlog) < MAX_TX_QUEUE_DEPTH)
+ netif_wake_queue(tx_netdev);
+ skb = container_of((void *)work, struct sk_buff, cb);
+ if (unlikely(tx_priv->tx_timestamp_hw) &&
+ unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS))
+ octeon3_eth_tx_complete_hwtstamp(tx_priv, skb);
This is where you should be incrementing the TX bytes and packets
statistics, not in your ndo_start_xmit() like you are currently doing.
+ dev_kfree_skb(skb);
Consider using dev_consume_skb_any() here to be drop-monitor friendly;
a rough sketch of both changes is after this function.
+ }
+
+ aq_cnt = oct_csr_read(SSO_GRP_AQ_CNT(oen->node, oen->tx_complete_grp));
+ aq_cnt &= GENMASK_ULL(32, 0);
+ if ((backlog > backlog_stop_thresh || aq_cnt > tx_complete_stop_thresh) &&
+ order < ARRAY_SIZE(oen->workers) - 1) {
+ atomic_set(&oen->workers[order + 1].kick, 1);
+ wake_up(&oen->workers[order + 1].queue);
+ }
+ } while (!need_resched() &&
+ (backlog > backlog_stop_thresh ||
+ aq_cnt > tx_complete_stop_thresh));
+
+ cond_resched();
+
+ if (!octeon3_eth_tx_complete_runnable(worker))
+ octeon3_sso_irq_set(oen->node, oen->tx_complete_grp, true);
+ }
+
+ return 0;
+}
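The dev_consume_skb_any()/statistics change would look roughly like
this in place of the dev_kfree_skb() above (still using the existing
atomic64 counters for the sake of the example):

	atomic64_inc(&tx_priv->tx_packets);
	atomic64_add(skb->len, &tx_priv->tx_octets);
	dev_consume_skb_any(skb);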
+
+static irqreturn_t octeon3_eth_tx_handler(int irq, void *info)
+{
+ struct octeon3_ethernet_node *oen = info;
+ /* Disarm the irq. */
+ octeon3_sso_irq_set(oen->node, oen->tx_complete_grp, false);
+ atomic_set(&oen->workers[0].kick, 1);
+ wake_up(&oen->workers[0].queue);
Can you move the whole worker to a NAPI context/softirq context? A
rough sketch of what I mean is below.
[...]
+ return IRQ_HANDLED;
+}
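The NAPI variant I have in mind would be roughly the following; tx_napi
is an invented per-node field (registered with netif_napi_add() against
a dummy netdev), and queue wake-up, statistics and the replenish kick
are omitted for brevity:

static irqreturn_t octeon3_eth_tx_handler(int irq, void *info)
{
	struct octeon3_ethernet_node *oen = info;

	octeon3_sso_irq_set(oen->node, oen->tx_complete_grp, false);
	napi_schedule(&oen->tx_napi);
	return IRQ_HANDLED;
}

static int octeon3_eth_tx_napi(struct napi_struct *napi, int budget)
{
	struct octeon3_ethernet_node *oen =
		container_of(napi, struct octeon3_ethernet_node, tx_napi);
	int done = 0;

	while (done < budget) {
		struct wr_ret r = octeon3_core_get_work_sync(oen->tx_complete_grp);
		struct sk_buff *skb;

		if (!r.work)
			break;
		skb = container_of((void *)r.work, struct sk_buff, cb);
		dev_consume_skb_any(skb);
		done++;
	}

	if (done < budget) {
		napi_complete(napi);
		octeon3_sso_irq_set(oen->node, oen->tx_complete_grp, true);
	}

	return done;
}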
+
[snip]
+ /* Strip the ethernet fcs */
+ pskb_trim(skb, skb->len - 4);
+ }
+
+ if (likely(priv->netdev->flags & IFF_UP)) {
+ skb_checksum_none_assert(skb);
+ if (unlikely(priv->rx_timestamp_hw)) {
+ /* The first 8 bytes are the timestamp */
+ u64 hwts = *(u64 *)skb->data;
+ u64 ns;
+ struct skb_shared_hwtstamps *shts;
+
+ ns = timecounter_cyc2time(&priv->tc, hwts);
+ shts = skb_hwtstamps(skb);
+ memset(shts, 0, sizeof(*shts));
+ shts->hwtstamp = ns_to_ktime(ns);
+ __skb_pull(skb, 8);
+ }
+
+ skb->protocol = eth_type_trans(skb, priv->netdev);
+ skb->dev = priv->netdev;
+ if (priv->netdev->features & NETIF_F_RXCSUM) {
+ if ((work->word2.lc_hdr_type == PKI_LTYPE_IP4 ||
+ work->word2.lc_hdr_type == PKI_LTYPE_IP6) &&
+ (work->word2.lf_hdr_type == PKI_LTYPE_TCP ||
+ work->word2.lf_hdr_type == PKI_LTYPE_UDP ||
+ work->word2.lf_hdr_type == PKI_LTYPE_SCTP))
+ if (work->word2.err_code == 0)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ napi_gro_receive(&rx->napi, skb);
+ } else {
+ /* Drop any packet received for a device that isn't up */
If that happens, isn't that a blatant indication of a bug in how the
interface is brought down?
+ atomic64_inc(&priv->rx_dropped);
+ dev_kfree_skb_any(skb);
+ }
+out:
+ return ret;
+}
+
+static int octeon3_eth_napi(struct napi_struct *napi, int budget)
+{
+ int rx_count = 0;
+ struct octeon3_rx *cxt;
+ struct octeon3_ethernet *priv;
+ u64 aq_cnt;
+ int n = 0;
+ int n_bufs = 0;
+ u64 old_scratch;
+
+ cxt = container_of(napi, struct octeon3_rx, napi);
+ priv = cxt->parent;
+
+ /* Get the amount of work pending */
+ aq_cnt = oct_csr_read(SSO_GRP_AQ_CNT(priv->node, cxt->rx_grp));
+ aq_cnt &= GENMASK_ULL(32, 0);
+
+ if (likely(USE_ASYNC_IOBDMA)) {
+ /* Save scratch in case userspace is using it */
+ CVMX_SYNCIOBDMA;
+ old_scratch = scratch_read64(SCR_SCRATCH);
+
+ octeon3_core_get_work_async(cxt->rx_grp);
+ }
+
+ while (rx_count < budget) {
+ n = 0;
+
+ if (likely(USE_ASYNC_IOBDMA)) {
+ bool req_next = rx_count < (budget - 1) ? true : false;
+
+ n = octeon3_eth_rx_one(cxt, true, req_next);
+ } else {
+ n = octeon3_eth_rx_one(cxt, false, false);
+ }
+
+ if (n == 0)
+ break;
+
+ n_bufs += n;
+ rx_count++;
+ }
+
+ /* Wake up worker threads */
+ n_bufs = atomic64_add_return(n_bufs, &priv->buffers_needed);
+ if (n_bufs >= 32) {
+ struct octeon3_ethernet_node *oen;
+
+ oen = octeon3_eth_node + priv->node;
+ atomic_set(&oen->workers[0].kick, 1);
+ wake_up(&oen->workers[0].queue);
+ }
+
+ /* Stop the thread when no work is pending */
+ if (rx_count < budget) {
+ napi_complete(napi);
+ octeon3_sso_irq_set(cxt->parent->node, cxt->rx_grp, true);
+ }
+
+ if (likely(USE_ASYNC_IOBDMA)) {
+ /* Restore the scratch area */
+ scratch_write64(SCR_SCRATCH, old_scratch);
+ }
+
+ return rx_count;
+}
+
+#undef BROKEN_SIMULATOR_CSUM
+
+static void ethtool_get_drvinfo(struct net_device *netdev,
+ struct ethtool_drvinfo *info)
+{
+ strcpy(info->driver, "octeon3-ethernet");
+ strcpy(info->version, "1.0");
+ strcpy(info->bus_info, "Builtin");
I believe the correct way to specify that type of bus is to use "platform".
[snip]
+static int octeon3_eth_common_ndo_stop(struct net_device *netdev)
+{
+ struct octeon3_ethernet *priv = netdev_priv(netdev);
+ void **w;
+ struct sk_buff *skb;
+ struct octeon3_rx *rx;
+ int i;
+
+ /* Allow enough time for ingress in transit packets to be drained */
+ msleep(20);
Can you find a better way to do that? You could put a hard disable on
the hardware and then poll until a particular condition indicates full
drainage.
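Something like the poll below, after hard-disabling ingress in the
BGX/PKI, would bound the wait on an actual condition instead of a fixed
sleep (the 100 ms timeout is arbitrary):

	unsigned long timeout = jiffies + msecs_to_jiffies(100);

	for (i = 0; i < priv->num_rx_cxt; i++) {
		rx = &priv->rx_cxt[i];
		/* Wait for this group's admission queue to drain. */
		while ((oct_csr_read(SSO_GRP_AQ_CNT(priv->node, rx->rx_grp)) &
			GENMASK_ULL(32, 0)) &&
		       time_before(jiffies, timeout))
			usleep_range(100, 200);
	}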
[snip]
+static int octeon3_eth_ndo_start_xmit(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ struct sk_buff *skb_tmp;
+ struct octeon3_ethernet *priv = netdev_priv(netdev);
+ u64 scr_off = LMTDMA_SCR_OFFSET;
+ u64 pko_send_desc;
+ u64 lmtdma_data;
+ u64 aq_cnt = 0;
+ struct octeon3_ethernet_node *oen;
+ long backlog;
+ int frag_count;
+ u64 head_len;
+ int i;
+ u64 *dma_addr;
dma_addr_t?
+ void **work;
+ unsigned int mss;
+ int grp;
+
+ frag_count = 0;
+ if (skb_has_frag_list(skb))
+ skb_walk_frags(skb, skb_tmp)
+ frag_count++;
+
+ /* Stop the queue if pko or sso are not keeping up */
+ oen = octeon3_eth_node + priv->node;
+ aq_cnt = oct_csr_read(SSO_GRP_AQ_CNT(oen->node, oen->tx_complete_grp));
+ aq_cnt &= GENMASK_ULL(32, 0);
+ backlog = atomic64_inc_return(&priv->tx_backlog);
+ if (unlikely(backlog > MAX_TX_QUEUE_DEPTH || aq_cnt > 100000))
+ netif_stop_queue(netdev);
+
+ /* We have space for 11 segment pointers, If there will be
+ * more than that, we must linearize. The count is: 1 (base
+ * SKB) + frag_count + nr_frags.
+ */
+ if (unlikely(skb_shinfo(skb)->nr_frags + frag_count > 10)) {
+ if (unlikely(__skb_linearize(skb)))
+ goto skip_xmit;
+ frag_count = 0;
+ }
What's so special about 10? The maximum the network stack could pass is
MAX_SKB_FRAGS; what would happen in that case?
+
+ work = (void **)skb->cb;
+ work[0] = netdev;
+ work[1] = NULL;
+
+ /* Adjust the port statistics. */
+ atomic64_inc(&priv->tx_packets);
+ atomic64_add(skb->len, &priv->tx_octets);
Do this in the TX completion worker; there is no guarantee the packet
will be transmitted this early in this function.
+
+ /* Make sure packet data writes are committed before
+ * submitting the command below
+ */
+ wmb();
That seems just wrong here. If your goal is to make sure that e.g.
skb_linearize() finished its pending writes to DRAM, you need to use
the DMA API for that. If the device is cache coherent, the DMA API will
know that and do nothing.
+
+ /* Build the pko command */
+ pko_send_desc = build_pko_send_hdr_desc(skb);
+ preempt_disable();
Why do you disable preemption here?
+ scratch_write64(scr_off, pko_send_desc);
+ scr_off += sizeof(pko_send_desc);
+
+ /* Request packet to be ptp timestamped */
+ if ((unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) &&
+ unlikely(priv->tx_timestamp_hw)) {
+ pko_send_desc = build_pko_send_ext_desc(skb);
+ scratch_write64(scr_off, pko_send_desc);
+ scr_off += sizeof(pko_send_desc);
+ }
+
+ /* Add the tso descriptor if needed */
+ mss = skb_shinfo(skb)->gso_size;
+ if (unlikely(mss)) {
+ pko_send_desc = build_pko_send_tso(skb, netdev->mtu);
+ scratch_write64(scr_off, pko_send_desc);
+ scr_off += sizeof(pko_send_desc);
+ }
+
+ /* Add a gather descriptor for each segment. See PKO_SEND_GATHER_S for
+ * the send gather descriptor format.
+ */
+ pko_send_desc = 0;
+ pko_send_desc |= (u64)PKO_SENDSUBDC_GATHER << 45;
+ head_len = skb_headlen(skb);
+ if (head_len > 0) {
+ pko_send_desc |= head_len << 48;
+ pko_send_desc |= virt_to_phys(skb->data);
+ scratch_write64(scr_off, pko_send_desc);
+ scr_off += sizeof(pko_send_desc);
+ }
+ for (i = 1; i <= skb_shinfo(skb)->nr_frags; i++) {
+ struct skb_frag_struct *fs = skb_shinfo(skb)->frags + i - 1;
+
+ pko_send_desc &= ~(GENMASK_ULL(63, 48) | GENMASK_ULL(41, 0));
+ pko_send_desc |= (u64)fs->size << 48;
+ pko_send_desc |= virt_to_phys((u8 *)page_address(fs->page.p) + fs->page_offset);
+ scratch_write64(scr_off, pko_send_desc);
+ scr_off += sizeof(pko_send_desc);
+ }
+ skb_walk_frags(skb, skb_tmp) {
+ pko_send_desc &= ~(GENMASK_ULL(63, 48) | GENMASK_ULL(41, 0));
+ pko_send_desc |= (u64)skb_tmp->len << 48;
+ pko_send_desc |= virt_to_phys(skb_tmp->data);
+ scratch_write64(scr_off, pko_send_desc);
+ scr_off += sizeof(pko_send_desc);
+ }
+
+ /* Subtract 1 from the tx_backlog. */
+ pko_send_desc = build_pko_send_mem_sub(virt_to_phys(&priv->tx_backlog));
+ scratch_write64(scr_off, pko_send_desc);
+ scr_off += sizeof(pko_send_desc);
+
+ /* Write the ptp timestamp in the skb itself */
+ if ((unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) &&
+ unlikely(priv->tx_timestamp_hw)) {
+ pko_send_desc = build_pko_send_mem_ts(virt_to_phys(&work[1]));
+ scratch_write64(scr_off, pko_send_desc);
+ scr_off += sizeof(pko_send_desc);
+ }
+
+ /* Send work when finished with the packet. */
+ grp = octeon3_eth_lgrp_to_ggrp(priv->node, priv->tx_complete_grp);
+ pko_send_desc = build_pko_send_work(grp, virt_to_phys(work));
+ scratch_write64(scr_off, pko_send_desc);
+ scr_off += sizeof(pko_send_desc);
+
+ /* See PKO_SEND_DMA_S in the HRM for the lmtdam data format */
+ lmtdma_data = 0;
+ lmtdma_data |= (u64)(LMTDMA_SCR_OFFSET >> 3) << 56;
+ if (wait_pko_response)
+ lmtdma_data |= 1ull << 48;
+ lmtdma_data |= 0x51ull << 40;
+ lmtdma_data |= (u64)priv->node << 36;
+ lmtdma_data |= priv->pko_queue << 16;
+
+ dma_addr = (u64 *)(LMTDMA_ORDERED_IO_ADDR | ((scr_off & 0x78) - 8));
+ *dma_addr = lmtdma_data;
+
+ preempt_enable();
+
+ if (wait_pko_response) {
+ u64 query_rtn;
+
+ CVMX_SYNCIOBDMA;
+
+ /* See PKO_QUERY_RTN_S in the HRM for the return format */
+ query_rtn = scratch_read64(LMTDMA_SCR_OFFSET);
+ query_rtn >>= 60;
+ if (unlikely(query_rtn != PKO_DQSTATUS_PASS)) {
+ netdev_err(netdev, "PKO enqueue failed %llx\n",
+ (unsigned long long)query_rtn);
+ dev_kfree_skb_any(skb);
+ }
+ }
So I am not sure I fully understand how sending packets works, although
it seems like you are building a work element (pko_send_desc) which
references either a full-size Ethernet frame, or that frame and its
fragments (multiple pko_send_desc). In that case, I don't see why you
can't just use dma_map_single()/dma_unmap_single() against skb->data
and its potential fragments instead of using virt_to_phys() like you
currently do, which is absolutely not portable.
dma_map_single() on the kernel linear address space should be
equivalent to virt_to_phys() anyway, and you would get the nice
properties of the DMA API, like its portability.
I could imagine that, for coherency purposes, there may be a
requirement to keep skb->data and friends within XKSEG0/1; if that's
the case, the DMA API should know that too.
I might be completely off, but using virt_to_phys() here sure does
sound non-portable. A rough sketch of what I mean is below this
function.
+
+ return NETDEV_TX_OK;
+skip_xmit:
+ atomic64_inc(&priv->tx_dropped);
+ dev_kfree_skb_any(skb);
+ return NETDEV_TX_OK;
+}
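The DMA API usage I am suggesting, roughly, for the linear part of the
skb (the frags would go through skb_frag_dma_map(), the matching
dma_unmap_single() belongs in the TX completion path, and this assumes
SET_NETDEV_DEV() was called so netdev->dev.parent is the real device):

	dma_addr_t daddr;

	daddr = dma_map_single(netdev->dev.parent, skb->data,
			       skb_headlen(skb), DMA_TO_DEVICE);
	if (dma_mapping_error(netdev->dev.parent, daddr))
		goto skip_xmit;

	pko_send_desc |= head_len << 48;
	pko_send_desc |= daddr;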
+
+static void octeon3_eth_ndo_get_stats64(struct net_device *netdev,
+ struct rtnl_link_stats64 *s)
+{
+ struct octeon3_ethernet *priv = netdev_priv(netdev);
+ u64 packets, octets, dropped;
+ u64 delta_packets, delta_octets, delta_dropped;
+
+ spin_lock(&priv->stat_lock);
Consider using u64_stats_sync to get rid of this lock...
+
+ octeon3_pki_get_stats(priv->node, priv->pknd, &packets, &octets, &dropped);
+
+ delta_packets = (packets - priv->last_packets) & ((1ull << 48) - 1);
+ delta_octets = (octets - priv->last_octets) & ((1ull << 48) - 1);
+ delta_dropped = (dropped - priv->last_dropped) & ((1ull << 48) - 1);
+
+ priv->last_packets = packets;
+ priv->last_octets = octets;
+ priv->last_dropped = dropped;
+
+ spin_unlock(&priv->stat_lock);
+
+ atomic64_add(delta_packets, &priv->rx_packets);
+ atomic64_add(delta_octets, &priv->rx_octets);
+ atomic64_add(delta_dropped, &priv->rx_dropped);
and summing up these things as well.
+
+ s->rx_packets = atomic64_read(&priv->rx_packets);
+ s->rx_bytes = atomic64_read(&priv->rx_octets);
+ s->rx_dropped = atomic64_read(&priv->rx_dropped);
+ s->rx_errors = atomic64_read(&priv->rx_errors);
+ s->rx_length_errors = atomic64_read(&priv->rx_length_errors);
+ s->rx_crc_errors = atomic64_read(&priv->rx_crc_errors);
+
+ s->tx_packets = atomic64_read(&priv->tx_packets);
+ s->tx_bytes = atomic64_read(&priv->tx_octets);
+ s->tx_dropped = atomic64_read(&priv->tx_dropped);
+}
[snip]
+enum port_mode {
+ PORT_MODE_DISABLED,
+ PORT_MODE_SGMII,
+ PORT_MODE_RGMII,
+ PORT_MODE_XAUI,
+ PORT_MODE_RXAUI,
+ PORT_MODE_XLAUI,
+ PORT_MODE_XFI,
+ PORT_MODE_10G_KR,
+ PORT_MODE_40G_KR4
+};
Can you use phy_interface_t values for this?
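A partial mapping as an illustration, assuming a reasonably recent
phy.h; XFI, XLAUI and 40G-KR4 have no direct phy_interface_t
equivalent today and would need new values:

	static const phy_interface_t port_mode_to_phy_if[] = {
		[PORT_MODE_SGMII]  = PHY_INTERFACE_MODE_SGMII,
		[PORT_MODE_RGMII]  = PHY_INTERFACE_MODE_RGMII,
		[PORT_MODE_XAUI]   = PHY_INTERFACE_MODE_XAUI,
		[PORT_MODE_RXAUI]  = PHY_INTERFACE_MODE_RXAUI,
		[PORT_MODE_10G_KR] = PHY_INTERFACE_MODE_10GKR,
	};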
+
+enum lane_mode {
+ R_25G_REFCLK100,
+ R_5G_REFCLK100,
+ R_8G_REFCLK100,
+ R_125G_REFCLK15625_KX,
+ R_3125G_REFCLK15625_XAUI,
+ R_103125G_REFCLK15625_KR,
+ R_125G_REFCLK15625_SGMII,
+ R_5G_REFCLK15625_QSGMII,
+ R_625G_REFCLK15625_RXAUI,
+ R_25G_REFCLK125,
+ R_5G_REFCLK125,
+ R_8G_REFCLK125
+};
+
+struct port_status {
+ int link;
+ int duplex;
+ int speed;
+};
Likewise, using phy_device would give you this information.
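e.g., with an attached phy_device these come for free:

	struct phy_device *phydev = netdev->phydev;

	if (phydev && phydev->link) {
		status.link = 1;
		status.speed = phydev->speed;
		status.duplex = phydev->duplex;
	}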