diff --git a/kernel/file_ops.c b/kernel/file_ops.c new file mode 100644 index 00000000..f16cc995 --- /dev/null +++ b/kernel/file_ops.c @@ -0,0 +1,1558 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "pio.h" +#include "device.h" +#include "common.h" +#include "trace.h" +#include "user_sdma.h" +#include "user_exp_rcv.h" +#include "aspm.h" +#include "mmu_rb.h" + +#undef pr_fmt +#define pr_fmt(fmt) DRIVER_NAME ": " fmt + +#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */ + +/* + * File operation functions + */ +static int hfi1_file_open(struct inode *, struct file *); +static int hfi1_file_close(struct inode *, struct file *); +static ssize_t hfi1_aio_write(struct kiocb *kiocb, const struct iovec *iovec, + unsigned long dim, loff_t offset); +static unsigned int hfi1_poll(struct file *, struct poll_table_struct *); +static int hfi1_file_mmap(struct file *, struct vm_area_struct *); + +static u64 kvirt_to_phys(void *); +static int assign_ctxt(struct file *, struct hfi1_user_info *); +static int init_subctxts(struct hfi1_ctxtdata *, const struct hfi1_user_info *); +static int user_init(struct file *); +static int get_ctxt_info(struct file *, void __user *, __u32); +static int get_base_info(struct file *, void __user *, __u32); +static int setup_ctxt(struct file *); +static int setup_subctxt(struct hfi1_ctxtdata *); +static int get_user_context(struct file *, struct hfi1_user_info *, int); +static int find_shared_ctxt(struct file *, const struct hfi1_user_info *); +static int allocate_ctxt(struct file *, struct hfi1_devdata *, + struct hfi1_user_info *); +static unsigned int poll_urgent(struct file *, struct poll_table_struct *); +static unsigned int poll_next(struct file *, struct poll_table_struct *); +static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long); +static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16); +static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int); +static int vma_fault(struct vm_area_struct *, struct vm_fault *); +static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg); + +static const struct file_operations hfi1_file_ops = { + .owner = THIS_MODULE, + .aio_write = hfi1_aio_write, + .open = hfi1_file_open, + .release = hfi1_file_close, + .unlocked_ioctl = hfi1_file_ioctl, + .poll = hfi1_poll, + .mmap = hfi1_file_mmap, + .llseek = noop_llseek, +}; + +static struct vm_operations_struct vm_ops = { + .fault = vma_fault, +}; + +/* + * Types of memories mapped into user processes' space + */ +enum mmap_types { + PIO_BUFS = 1, + PIO_BUFS_SOP, + PIO_CRED, + RCV_HDRQ, + RCV_EGRBUF, + UREGS, + EVENTS, + STATUS, + RTAIL, + SUBCTXT_UREGS, + SUBCTXT_RCV_HDRQ, + SUBCTXT_EGRBUF, + SDMA_COMP +}; + +/* + * Masks and offsets defining the mmap tokens + */ +#define HFI1_MMAP_OFFSET_MASK 0xfffULL +#define HFI1_MMAP_OFFSET_SHIFT 0 +#define HFI1_MMAP_SUBCTXT_MASK 0xfULL +#define HFI1_MMAP_SUBCTXT_SHIFT 12 +#define HFI1_MMAP_CTXT_MASK 0xffULL +#define HFI1_MMAP_CTXT_SHIFT 16 +#define HFI1_MMAP_TYPE_MASK 0xfULL +#define HFI1_MMAP_TYPE_SHIFT 24 +#define HFI1_MMAP_MAGIC_MASK 0xffffffffULL +#define HFI1_MMAP_MAGIC_SHIFT 32 + +#define HFI1_MMAP_MAGIC 0xdabbad00 + +#define HFI1_MMAP_TOKEN_SET(field, val) \ + (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT) +#define HFI1_MMAP_TOKEN_GET(field, token) \ + (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK) +#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr) \ + (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \ + HFI1_MMAP_TOKEN_SET(TYPE, type) | \ + HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \ + HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \ + HFI1_MMAP_TOKEN_SET(OFFSET, (offset_in_page(addr)))) + +#define dbg(fmt, ...) \ + pr_info(fmt, ##__VA_ARGS__) + +static inline int is_valid_mmap(u64 token) +{ + return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC); +} + +static int hfi1_file_open(struct inode *inode, struct file *fp) +{ + struct hfi1_filedata *fd; + struct hfi1_devdata *dd = container_of(inode->i_cdev, + struct hfi1_devdata, + user_cdev); + + if (!atomic_inc_not_zero(&dd->user_refcount)) + return -ENXIO; + + /* Just take a ref now. Not all opens result in a context assign */ + kobject_get(&dd->kobj); + + /* The real work is performed later in assign_ctxt() */ + + fd = kzalloc(sizeof(*fd), GFP_KERNEL); + + if (fd) { + fd->rec_cpu_num = -1; /* no cpu affinity by default */ + fd->mm = current->mm; + atomic_inc(&fd->mm->mm_count); + fp->private_data = fd; + } else { + fp->private_data = NULL; + + if (atomic_dec_and_test(&dd->user_refcount)) + complete(&dd->user_comp); + + return -ENOMEM; + } + + return 0; +} + +static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_user_info uinfo; + struct hfi1_tid_info tinfo; + int ret = 0; + unsigned long addr; + int uval = 0; + unsigned long ul_uval = 0; + u16 uval16 = 0; + + hfi1_cdbg(IOCTL, "IOCTL recv: 0x%x", cmd); + if (cmd != HFI1_IOCTL_ASSIGN_CTXT && + cmd != HFI1_IOCTL_GET_VERS && + !uctxt) + return -EINVAL; + + switch (cmd) { + case HFI1_IOCTL_ASSIGN_CTXT: + if (uctxt) + return -EINVAL; + + if (copy_from_user(&uinfo, + (struct hfi1_user_info __user *)arg, + sizeof(uinfo))) + return -EFAULT; + + ret = assign_ctxt(fp, &uinfo); + if (ret < 0) + return ret; + ret = setup_ctxt(fp); + if (ret) + return ret; + ret = user_init(fp); + break; + case HFI1_IOCTL_CTXT_INFO: + ret = get_ctxt_info(fp, (void __user *)(unsigned long)arg, + sizeof(struct hfi1_ctxt_info)); + break; + case HFI1_IOCTL_USER_INFO: + ret = get_base_info(fp, (void __user *)(unsigned long)arg, + sizeof(struct hfi1_base_info)); + break; + case HFI1_IOCTL_CREDIT_UPD: + if (uctxt) + sc_return_credits(uctxt->sc); + break; + + case HFI1_IOCTL_TID_UPDATE: + if (copy_from_user(&tinfo, + (struct hfi11_tid_info __user *)arg, + sizeof(tinfo))) + return -EFAULT; + + ret = hfi1_user_exp_rcv_setup(fp, &tinfo); + if (!ret) { + /* + * Copy the number of tidlist entries we used + * and the length of the buffer we registered. + * These fields are adjacent in the structure so + * we can copy them at the same time. + */ + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt) + + sizeof(tinfo.length))) + ret = -EFAULT; + } + break; + + case HFI1_IOCTL_TID_FREE: + if (copy_from_user(&tinfo, + (struct hfi11_tid_info __user *)arg, + sizeof(tinfo))) + return -EFAULT; + + ret = hfi1_user_exp_rcv_clear(fp, &tinfo); + if (ret) + break; + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt))) + ret = -EFAULT; + break; + + case HFI1_IOCTL_TID_INVAL_READ: + if (copy_from_user(&tinfo, + (struct hfi11_tid_info __user *)arg, + sizeof(tinfo))) + return -EFAULT; + + ret = hfi1_user_exp_rcv_invalid(fp, &tinfo); + if (ret) + break; + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt))) + ret = -EFAULT; + break; + + case HFI1_IOCTL_RECV_CTRL: + ret = get_user(uval, (int __user *)arg); + if (ret != 0) + return -EFAULT; + ret = manage_rcvq(uctxt, fd->subctxt, uval); + break; + + case HFI1_IOCTL_POLL_TYPE: + ret = get_user(uval, (int __user *)arg); + if (ret != 0) + return -EFAULT; + uctxt->poll_type = (typeof(uctxt->poll_type))uval; + break; + + case HFI1_IOCTL_ACK_EVENT: + ret = get_user(ul_uval, (unsigned long __user *)arg); + if (ret != 0) + return -EFAULT; + ret = user_event_ack(uctxt, fd->subctxt, ul_uval); + break; + + case HFI1_IOCTL_SET_PKEY: + ret = get_user(uval16, (u16 __user *)arg); + if (ret != 0) + return -EFAULT; + if (HFI1_CAP_IS_USET(PKEY_CHECK)) + ret = set_ctxt_pkey(uctxt, fd->subctxt, uval16); + else + return -EPERM; + break; + + case HFI1_IOCTL_CTXT_RESET: { + struct send_context *sc; + struct hfi1_devdata *dd; + + if (!uctxt || !uctxt->dd || !uctxt->sc) + return -EINVAL; + + /* + * There is no protection here. User level has to + * guarantee that no one will be writing to the send + * context while it is being re-initialized. + * If user level breaks that guarantee, it will break + * it's own context and no one else's. + */ + dd = uctxt->dd; + sc = uctxt->sc; + /* + * Wait until the interrupt handler has marked the + * context as halted or frozen. Report error if we time + * out. + */ + wait_event_interruptible_timeout( + sc->halt_wait, (sc->flags & SCF_HALTED), + msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT)); + if (!(sc->flags & SCF_HALTED)) + return -ENOLCK; + + /* + * If the send context was halted due to a Freeze, + * wait until the device has been "unfrozen" before + * resetting the context. + */ + if (sc->flags & SCF_FROZEN) { + wait_event_interruptible_timeout( + dd->event_queue, + !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN), + msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT)); + if (dd->flags & HFI1_FROZEN) + return -ENOLCK; + + if (dd->flags & HFI1_FORCED_FREEZE) + /* + * Don't allow context reset if we are into + * forced freeze + */ + return -ENODEV; + + sc_disable(sc); + ret = sc_enable(sc); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, + uctxt->ctxt); + } else { + ret = sc_restart(sc); + } + if (!ret) + sc_return_credits(sc); + break; + } + + case HFI1_IOCTL_GET_VERS: + uval = HFI1_USER_SWVERSION; + if (put_user(uval, (int __user *)arg)) + return -EFAULT; + break; + + default: + return -EINVAL; + } + + return ret; +} + +static ssize_t hfi1_aio_write(struct kiocb *kiocb, const struct iovec *iovec, + unsigned long dim, loff_t offset) +{ + struct hfi1_filedata *fd = kiocb->ki_filp->private_data; + struct hfi1_user_sdma_pkt_q *pq = fd->pq; + struct hfi1_user_sdma_comp_q *cq = fd->cq; + int done = 0, reqs = 0; + + if (!cq || !pq) + return -EIO; + + if (!dim) + return -EINVAL; + + hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)", + fd->uctxt->ctxt, fd->subctxt, dim); + + if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) + return -ENOSPC; + + while (dim) { + int ret; + unsigned long count = 0; + + ret = hfi1_user_sdma_process_request( + kiocb->ki_filp, (struct iovec *)(iovec + done), + dim, &count); + if (ret) { + reqs = ret; + break; + } + dim -= count; + done += count; + reqs++; + } + + return reqs; +} + +static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd; + unsigned long flags; + u64 token = vma->vm_pgoff << PAGE_SHIFT, + memaddr = 0; + void *memvirt = NULL; + u8 subctxt, mapio = 0, vmf = 0, type; + ssize_t memlen = 0; + int ret = 0; + u16 ctxt; + + if (!is_valid_mmap(token) || !uctxt || + !(vma->vm_flags & VM_SHARED)) { + ret = -EINVAL; + goto done; + } + dd = uctxt->dd; + ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token); + subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token); + type = HFI1_MMAP_TOKEN_GET(TYPE, token); + if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) { + ret = -EINVAL; + goto done; + } + + flags = vma->vm_flags; + + switch (type) { + case PIO_BUFS: + case PIO_BUFS_SOP: + memaddr = ((dd->physaddr + TXE_PIO_SEND) + + /* chip pio base */ + (uctxt->sc->hw_context * BIT(16))) + + /* 64K PIO space / ctxt */ + (type == PIO_BUFS_SOP ? + (TXE_PIO_SIZE / 2) : 0); /* sop? */ + /* + * Map only the amount allocated to the context, not the + * entire available context's PIO space. + */ + memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE); + flags &= ~VM_MAYREAD; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + mapio = 1; + break; + case PIO_CRED: + if (flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + /* + * The credit return location for this context could be on the + * second or third page allocated for credit returns (if number + * of enabled contexts > 64 and 128 respectively). + */ + memvirt = dd->cr_base[uctxt->numa_id].va; + memaddr = virt_to_phys(memvirt) + + (((u64)uctxt->sc->hw_free - + (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK); + memlen = PAGE_SIZE; + flags &= ~VM_MAYWRITE; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + /* + * The driver has already allocated memory for credit + * returns and programmed it into the chip. Has that + * memory been flagged as non-cached? + */ + /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */ + mapio = 1; + break; + case RCV_HDRQ: + memlen = uctxt->rcvhdrq_size; + memvirt = uctxt->rcvhdrq; + break; + case RCV_EGRBUF: { + unsigned long addr; + int i; + /* + * The RcvEgr buffer need to be handled differently + * as multiple non-contiguous pages need to be mapped + * into the user process. + */ + memlen = uctxt->egrbufs.size; + if ((vma->vm_end - vma->vm_start) != memlen) { + dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n", + (vma->vm_end - vma->vm_start), memlen); + ret = -EINVAL; + goto done; + } + if (vma->vm_flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + vma->vm_flags &= ~VM_MAYWRITE; + addr = vma->vm_start; + for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) { + memlen = uctxt->egrbufs.buffers[i].len; + memvirt = uctxt->egrbufs.buffers[i].addr; + ret = remap_pfn_range( + vma, addr, + /* + * virt_to_pfn() does the same, but + * it's not available on x86_64 + * when CONFIG_MMU is enabled. + */ + PFN_DOWN(__pa(memvirt)), + memlen, + vma->vm_page_prot); + if (ret < 0) + goto done; + addr += memlen; + } + ret = 0; + goto done; + } + case UREGS: + /* + * Map only the page that contains this context's user + * registers. + */ + memaddr = (unsigned long) + (dd->physaddr + RXE_PER_CONTEXT_USER) + + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE); + /* + * TidFlow table is on the same page as the rest of the + * user registers. + */ + memlen = PAGE_SIZE; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + mapio = 1; + break; + case EVENTS: + /* + * Use the page where this context's flags are. User level + * knows where it's own bitmap is within the page. + */ + memaddr = (unsigned long)(dd->events + + ((uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK; + memlen = PAGE_SIZE; + /* + * v3.7 removes VM_RESERVED but the effect is kept by + * using VM_IO. + */ + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case STATUS: + memaddr = kvirt_to_phys((void *)dd->status); + memlen = PAGE_SIZE; + flags |= VM_IO | VM_DONTEXPAND; + break; + case RTAIL: + if (!HFI1_CAP_IS_USET(DMA_RTAIL)) { + /* + * If the memory allocation failed, the context alloc + * also would have failed, so we would never get here + */ + ret = -EINVAL; + goto done; + } + if (flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + memlen = PAGE_SIZE; + memvirt = (void *)uctxt->rcvhdrtail_kvaddr; + flags &= ~VM_MAYWRITE; + break; + case SUBCTXT_UREGS: + memaddr = (u64)uctxt->subctxt_uregbase; + memlen = PAGE_SIZE; + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case SUBCTXT_RCV_HDRQ: + memaddr = (u64)uctxt->subctxt_rcvhdr_base; + memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt; + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case SUBCTXT_EGRBUF: + memaddr = (u64)uctxt->subctxt_rcvegrbuf; + memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt; + flags |= VM_IO | VM_DONTEXPAND; + flags &= ~VM_MAYWRITE; + vmf = 1; + break; + case SDMA_COMP: { + struct hfi1_user_sdma_comp_q *cq = fd->cq; + + if (!cq) { + ret = -EFAULT; + goto done; + } + memaddr = (u64)cq->comps; + memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries); + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + } + default: + ret = -EINVAL; + break; + } + + if ((vma->vm_end - vma->vm_start) != memlen) { + hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu", + uctxt->ctxt, fd->subctxt, + (vma->vm_end - vma->vm_start), memlen); + ret = -EINVAL; + goto done; + } + + vma->vm_flags = flags; + hfi1_cdbg(PROC, + "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n", + ctxt, subctxt, type, mapio, vmf, memaddr, memlen, + vma->vm_end - vma->vm_start, vma->vm_flags); + if (vmf) { + vma->vm_pgoff = PFN_DOWN(memaddr); + vma->vm_ops = &vm_ops; + ret = 0; + } else if (mapio) { + ret = io_remap_pfn_range(vma, vma->vm_start, + PFN_DOWN(memaddr), + memlen, + vma->vm_page_prot); + } else if (memvirt) { + ret = remap_pfn_range(vma, vma->vm_start, + PFN_DOWN(__pa(memvirt)), + memlen, + vma->vm_page_prot); + } else { + ret = remap_pfn_range(vma, vma->vm_start, + PFN_DOWN(memaddr), + memlen, + vma->vm_page_prot); + } +done: + return ret; +} + +/* + * Local (non-chip) user memory is not mapped right away but as it is + * accessed by the user-level code. + */ +static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page; + + page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT)); + if (!page) + return VM_FAULT_SIGBUS; + + get_page(page); + vmf->page = page; + + return 0; +} + +static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt) +{ + struct hfi1_ctxtdata *uctxt; + unsigned pollflag; + + uctxt = ((struct hfi1_filedata *)fp->private_data)->uctxt; + if (!uctxt) + pollflag = POLLERR; + else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT) + pollflag = poll_urgent(fp, pt); + else if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV) + pollflag = poll_next(fp, pt); + else /* invalid */ + pollflag = POLLERR; + + return pollflag; +} + +static int hfi1_file_close(struct inode *inode, struct file *fp) +{ + struct hfi1_filedata *fdata = fp->private_data; + struct hfi1_ctxtdata *uctxt = fdata->uctxt; + struct hfi1_devdata *dd = container_of(inode->i_cdev, + struct hfi1_devdata, + user_cdev); + unsigned long flags, *ev; + + fp->private_data = NULL; + + if (!uctxt) + goto done; + + hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt); + mutex_lock(&hfi1_mutex); + + flush_wc(); + /* drain user sdma queue */ + hfi1_user_sdma_free_queues(fdata); + + /* release the cpu */ + hfi1_put_proc_affinity(fdata->rec_cpu_num); + + /* + * Clear any left over, unhandled events so the next process that + * gets this context doesn't get confused. + */ + ev = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + fdata->subctxt; + *ev = 0; + + if (--uctxt->cnt) { + uctxt->active_slaves &= ~(1 << fdata->subctxt); + mutex_unlock(&hfi1_mutex); + goto done; + } + + spin_lock_irqsave(&dd->uctxt_lock, flags); + /* + * Disable receive context and interrupt available, reset all + * RcvCtxtCtrl bits to default values. + */ + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_TIDFLOW_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_TAILUPD_DIS | + HFI1_RCVCTRL_ONE_PKT_EGR_DIS | + HFI1_RCVCTRL_NO_RHQ_DROP_DIS | + HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt); + /* Clear the context's J_KEY */ + hfi1_clear_ctxt_jkey(dd, uctxt->ctxt); + /* + * Reset context integrity checks to default. + * (writes to CSRs probably belong in chip.c) + */ + write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE, + hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type)); + sc_disable(uctxt->sc); + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + dd->rcd[uctxt->ctxt] = NULL; + + hfi1_user_exp_rcv_free(fdata); + hfi1_clear_ctxt_pkey(dd, uctxt->ctxt); + + uctxt->rcvwait_to = 0; + uctxt->piowait_to = 0; + uctxt->rcvnowait = 0; + uctxt->pionowait = 0; + uctxt->event_flags = 0; + + hfi1_stats.sps_ctxts--; + if (++dd->freectxts == dd->num_user_contexts) + aspm_enable_all(dd); + mutex_unlock(&hfi1_mutex); + hfi1_free_ctxtdata(dd, uctxt); +done: + mmdrop(fdata->mm); + kobject_put(&dd->kobj); + + if (atomic_dec_and_test(&dd->user_refcount)) + complete(&dd->user_comp); + + kfree(fdata); + return 0; +} + +/* + * Convert kernel *virtual* addresses to physical addresses. + * This is used to vmalloc'ed addresses. + */ +static u64 kvirt_to_phys(void *addr) +{ + struct page *page; + u64 paddr = 0; + + page = vmalloc_to_page(addr); + if (page) + paddr = page_to_pfn(page) << PAGE_SHIFT; + + return paddr; +} + +static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo) +{ + int i_minor, ret = 0; + unsigned int swmajor, swminor; + + swmajor = uinfo->userversion >> 16; + if (swmajor != HFI1_USER_SWMAJOR) { + ret = -ENODEV; + goto done; + } + + swminor = uinfo->userversion & 0xffff; + + mutex_lock(&hfi1_mutex); + /* First, lets check if we need to setup a shared context? */ + if (uinfo->subctxt_cnt) { + struct hfi1_filedata *fd = fp->private_data; + + ret = find_shared_ctxt(fp, uinfo); + if (ret < 0) + goto done_unlock; + if (ret) { + fd->rec_cpu_num = + hfi1_get_proc_affinity(fd->uctxt->numa_id); + } + } + + /* + * We execute the following block if we couldn't find a + * shared context or if context sharing is not required. + */ + if (!ret) { + i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE; + ret = get_user_context(fp, uinfo, i_minor); + } +done_unlock: + mutex_unlock(&hfi1_mutex); +done: + return ret; +} + +static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo, + int devno) +{ + struct hfi1_devdata *dd = NULL; + int devmax, npresent, nup; + + devmax = hfi1_count_units(&npresent, &nup); + if (!npresent) + return -ENXIO; + + if (!nup) + return -ENETDOWN; + + dd = hfi1_lookup(devno); + if (!dd) + return -ENODEV; + else if (!dd->freectxts) + return -EBUSY; + + return allocate_ctxt(fp, dd, uinfo); +} + +static int find_shared_ctxt(struct file *fp, + const struct hfi1_user_info *uinfo) +{ + int devmax, ndev, i; + int ret = 0; + struct hfi1_filedata *fd = fp->private_data; + + devmax = hfi1_count_units(NULL, NULL); + + for (ndev = 0; ndev < devmax; ndev++) { + struct hfi1_devdata *dd = hfi1_lookup(ndev); + + if (!(dd && (dd->flags & HFI1_PRESENT) && dd->kregbase)) + continue; + for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) { + struct hfi1_ctxtdata *uctxt = dd->rcd[i]; + + /* Skip ctxts which are not yet open */ + if (!uctxt || !uctxt->cnt) + continue; + /* Skip ctxt if it doesn't match the requested one */ + if (memcmp(uctxt->uuid, uinfo->uuid, + sizeof(uctxt->uuid)) || + uctxt->jkey != generate_jkey(current_uid()) || + uctxt->subctxt_id != uinfo->subctxt_id || + uctxt->subctxt_cnt != uinfo->subctxt_cnt) + continue; + + /* Verify the sharing process matches the master */ + if (uctxt->userversion != uinfo->userversion || + uctxt->cnt >= uctxt->subctxt_cnt) { + ret = -EINVAL; + goto done; + } + fd->uctxt = uctxt; + fd->subctxt = uctxt->cnt++; + uctxt->active_slaves |= 1 << fd->subctxt; + ret = 1; + goto done; + } + } + +done: + return ret; +} + +static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd, + struct hfi1_user_info *uinfo) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt; + unsigned ctxt; + int ret, numa; + + if (dd->flags & HFI1_FROZEN) { + /* + * Pick an error that is unique from all other errors + * that are returned so the user process knows that + * it tried to allocate while the SPC was frozen. It + * it should be able to retry with success in a short + * while. + */ + return -EIO; + } + + for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; ctxt++) + if (!dd->rcd[ctxt]) + break; + + if (ctxt == dd->num_rcv_contexts) + return -EBUSY; + + /* + * If we don't have a NUMA node requested, preference is towards + * device NUMA node. + */ + fd->rec_cpu_num = hfi1_get_proc_affinity(dd->node); + if (fd->rec_cpu_num != -1) + numa = cpu_to_node(fd->rec_cpu_num); + else + numa = numa_node_id(); + uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, numa); + if (!uctxt) { + dd_dev_err(dd, + "Unable to allocate ctxtdata memory, failing open\n"); + return -ENOMEM; + } + hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)", + uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num, + uctxt->numa_id); + + /* + * Allocate and enable a PIO send context. + */ + uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize, + uctxt->dd->node); + if (!uctxt->sc) { + ret = -ENOMEM; + goto ctxdata_free; + } + hfi1_cdbg(PROC, "allocated send context %u(%u)\n", uctxt->sc->sw_index, + uctxt->sc->hw_context); + ret = sc_enable(uctxt->sc); + if (ret) + goto ctxdata_free; + + /* + * Setup shared context resources if the user-level has requested + * shared contexts and this is the 'master' process. + * This has to be done here so the rest of the sub-contexts find the + * proper master. + */ + if (uinfo->subctxt_cnt && !fd->subctxt) { + ret = init_subctxts(uctxt, uinfo); + /* + * On error, we don't need to disable and de-allocate the + * send context because it will be done during file close + */ + if (ret) + goto ctxdata_free; + } + uctxt->userversion = uinfo->userversion; + uctxt->flags = hfi1_cap_mask; /* save current flag state */ + init_waitqueue_head(&uctxt->wait); + strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm)); + memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid)); + uctxt->jkey = generate_jkey(current_uid()); + INIT_LIST_HEAD(&uctxt->sdma_queues); + spin_lock_init(&uctxt->sdma_qlock); + hfi1_stats.sps_ctxts++; + /* + * Disable ASPM when there are open user/PSM contexts to avoid + * issues with ASPM L1 exit latency + */ + if (dd->freectxts-- == dd->num_user_contexts) + aspm_disable_all(dd); + fd->uctxt = uctxt; + + return 0; + +ctxdata_free: + dd->rcd[ctxt] = NULL; + hfi1_free_ctxtdata(dd, uctxt); + return ret; +} + +static int init_subctxts(struct hfi1_ctxtdata *uctxt, + const struct hfi1_user_info *uinfo) +{ + unsigned num_subctxts; + + num_subctxts = uinfo->subctxt_cnt; + if (num_subctxts > HFI1_MAX_SHARED_CTXTS) + return -EINVAL; + + uctxt->subctxt_cnt = uinfo->subctxt_cnt; + uctxt->subctxt_id = uinfo->subctxt_id; + uctxt->active_slaves = 1; + uctxt->redirect_seq_cnt = 1; + set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags); + + return 0; +} + +static int setup_subctxt(struct hfi1_ctxtdata *uctxt) +{ + int ret = 0; + unsigned num_subctxts = uctxt->subctxt_cnt; + + uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE); + if (!uctxt->subctxt_uregbase) { + ret = -ENOMEM; + goto bail; + } + /* We can take the size of the RcvHdr Queue from the master */ + uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size * + num_subctxts); + if (!uctxt->subctxt_rcvhdr_base) { + ret = -ENOMEM; + goto bail_ureg; + } + + uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size * + num_subctxts); + if (!uctxt->subctxt_rcvegrbuf) { + ret = -ENOMEM; + goto bail_rhdr; + } + goto bail; +bail_rhdr: + vfree(uctxt->subctxt_rcvhdr_base); +bail_ureg: + vfree(uctxt->subctxt_uregbase); + uctxt->subctxt_uregbase = NULL; +bail: + return ret; +} + +static int user_init(struct file *fp) +{ + unsigned int rcvctrl_ops = 0; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + + /* make sure that the context has already been setup */ + if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags)) + return -EFAULT; + + /* initialize poll variables... */ + uctxt->urgent = 0; + uctxt->urgent_poll = 0; + + /* + * Now enable the ctxt for receive. + * For chips that are set to DMA the tail register to memory + * when they change (and when the update bit transitions from + * 0 to 1. So for those chips, we turn it off and then back on. + * This will (very briefly) affect any other open ctxts, but the + * duration is very short, and therefore isn't an issue. We + * explicitly set the in-memory tail copy to 0 beforehand, so we + * don't have to wait to be sure the DMA update has happened + * (chip resets head/tail to 0 on transition to enable). + */ + if (uctxt->rcvhdrtail_kvaddr) + clear_rcvhdrtail(uctxt); + + /* Setup J_KEY before enabling the context */ + hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey); + + rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB; + if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP)) + rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB; + /* + * Ignore the bit in the flags for now until proper + * support for multiple packet per rcv array entry is + * added. + */ + if (!HFI1_CAP_UGET_MASK(uctxt->flags, MULTI_PKT_EGR)) + rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; + if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_EGR_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_RHQ_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; + /* + * The RcvCtxtCtrl.TailUpd bit has to be explicitly written. + * We can't rely on the correct value to be set from prior + * uses of the chip or ctxt. Therefore, add the rcvctrl op + * for both cases. + */ + if (HFI1_CAP_UGET_MASK(uctxt->flags, DMA_RTAIL)) + rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB; + else + rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS; + hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt); + + /* Notify any waiting slaves */ + if (uctxt->subctxt_cnt) { + clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags); + wake_up(&uctxt->wait); + } + + return 0; +} + +static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len) +{ + struct hfi1_ctxt_info cinfo; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + int ret = 0; + + memset(&cinfo, 0, sizeof(cinfo)); + cinfo.runtime_flags = (((uctxt->flags >> HFI1_CAP_MISC_SHIFT) & + HFI1_CAP_MISC_MASK) << HFI1_CAP_USER_SHIFT) | + HFI1_CAP_UGET_MASK(uctxt->flags, MASK) | + HFI1_CAP_KGET_MASK(uctxt->flags, K2U); + /* adjust flag if this fd is not able to cache */ + if (!fd->handler) + cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */ + + cinfo.num_active = hfi1_count_active_units(); + cinfo.unit = uctxt->dd->unit; + cinfo.ctxt = uctxt->ctxt; + cinfo.subctxt = fd->subctxt; + cinfo.rcvtids = roundup(uctxt->egrbufs.alloced, + uctxt->dd->rcv_entries.group_size) + + uctxt->expected_count; + cinfo.credits = uctxt->sc->credits; + cinfo.numa_node = uctxt->numa_id; + cinfo.rec_cpu = fd->rec_cpu_num; + cinfo.send_ctxt = uctxt->sc->hw_context; + + cinfo.egrtids = uctxt->egrbufs.alloced; + cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt; + cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2; + cinfo.sdma_ring_size = fd->cq->nentries; + cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size; + + trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo); + if (copy_to_user(ubase, &cinfo, sizeof(cinfo))) + ret = -EFAULT; + + return ret; +} + +static int setup_ctxt(struct file *fp) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + int ret = 0; + + /* + * Context should be set up only once, including allocation and + * programming of eager buffers. This is done if context sharing + * is not requested or by the master process. + */ + if (!uctxt->subctxt_cnt || !fd->subctxt) { + ret = hfi1_init_ctxt(uctxt->sc); + if (ret) + goto done; + + /* Now allocate the RcvHdr queue and eager buffers. */ + ret = hfi1_create_rcvhdrq(dd, uctxt); + if (ret) + goto done; + ret = hfi1_setup_eagerbufs(uctxt); + if (ret) + goto done; + if (uctxt->subctxt_cnt && !fd->subctxt) { + ret = setup_subctxt(uctxt); + if (ret) + goto done; + } + } else { + ret = wait_event_interruptible(uctxt->wait, !test_bit( + HFI1_CTXT_MASTER_UNINIT, + &uctxt->event_flags)); + if (ret) + goto done; + } + + ret = hfi1_user_sdma_alloc_queues(uctxt, fp); + if (ret) + goto done; + /* + * Expected receive has to be setup for all processes (including + * shared contexts). However, it has to be done after the master + * context has been fully configured as it depends on the + * eager/expected split of the RcvArray entries. + * Setting it up here ensures that the subcontexts will be waiting + * (due to the above wait_event_interruptible() until the master + * is setup. + */ + ret = hfi1_user_exp_rcv_init(fp); + if (ret) + goto done; + + set_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags); +done: + return ret; +} + +static int get_base_info(struct file *fp, void __user *ubase, __u32 len) +{ + struct hfi1_base_info binfo; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + ssize_t sz; + unsigned offset; + int ret = 0; + + trace_hfi1_uctxtdata(uctxt->dd, uctxt); + + memset(&binfo, 0, sizeof(binfo)); + binfo.hw_version = dd->revision; + binfo.sw_version = HFI1_KERN_SWVERSION; + binfo.bthqp = kdeth_qp; + binfo.jkey = uctxt->jkey; + /* + * If more than 64 contexts are enabled the allocated credit + * return will span two or three contiguous pages. Since we only + * map the page containing the context's credit return address, + * we need to calculate the offset in the proper page. + */ + offset = ((u64)uctxt->sc->hw_free - + (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE; + binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt, + fd->subctxt, offset); + binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt, + fd->subctxt, + uctxt->sc->base_addr); + binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP, + uctxt->ctxt, + fd->subctxt, + uctxt->sc->base_addr); + binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt, + fd->subctxt, + uctxt->rcvhdrq); + binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt, + fd->subctxt, + uctxt->egrbufs.rcvtids[0].dma); + binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt, + fd->subctxt, 0); + /* + * user regs are at + * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE)) + */ + binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt, + fd->subctxt, 0); + offset = offset_in_page((((uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + fd->subctxt) * + sizeof(*dd->events)); + binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt, + fd->subctxt, + offset); + binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt, + fd->subctxt, + dd->status); + if (HFI1_CAP_IS_USET(DMA_RTAIL)) + binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt, + fd->subctxt, 0); + if (uctxt->subctxt_cnt) { + binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS, + uctxt->ctxt, + fd->subctxt, 0); + binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ, + uctxt->ctxt, + fd->subctxt, 0); + binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF, + uctxt->ctxt, + fd->subctxt, 0); + } + sz = (len < sizeof(binfo)) ? len : sizeof(binfo); + if (copy_to_user(ubase, &binfo, sz)) + ret = -EFAULT; + return ret; +} + +static unsigned int poll_urgent(struct file *fp, + struct poll_table_struct *pt) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned pollflag; + + poll_wait(fp, &uctxt->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (uctxt->urgent != uctxt->urgent_poll) { + pollflag = POLLIN | POLLRDNORM; + uctxt->urgent_poll = uctxt->urgent; + } else { + pollflag = 0; + set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags); + } + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +static unsigned int poll_next(struct file *fp, + struct poll_table_struct *pt) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned pollflag; + + poll_wait(fp, &uctxt->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (hdrqempty(uctxt)) { + set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt); + pollflag = 0; + } else { + pollflag = POLLIN | POLLRDNORM; + } + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +/* + * Find all user contexts in use, and set the specified bit in their + * event mask. + * See also find_ctxt() for a similar use, that is specific to send buffers. + */ +int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit) +{ + struct hfi1_ctxtdata *uctxt; + struct hfi1_devdata *dd = ppd->dd; + unsigned ctxt; + int ret = 0; + unsigned long flags; + + if (!dd->events) { + ret = -EINVAL; + goto done; + } + + spin_lock_irqsave(&dd->uctxt_lock, flags); + for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; + ctxt++) { + uctxt = dd->rcd[ctxt]; + if (uctxt) { + unsigned long *evs = dd->events + + (uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS; + int i; + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + set_bit(evtbit, evs); + for (i = 1; i < uctxt->subctxt_cnt; i++) + set_bit(evtbit, evs + i); + } + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); +done: + return ret; +} + +/** + * manage_rcvq - manage a context's receive queue + * @uctxt: the context + * @subctxt: the sub-context + * @start_stop: action to carry out + * + * start_stop == 0 disables receive on the context, for use in queue + * overflow conditions. start_stop==1 re-enables, to be used to + * re-init the software copy of the head register + */ +static int manage_rcvq(struct hfi1_ctxtdata *uctxt, unsigned subctxt, + int start_stop) +{ + struct hfi1_devdata *dd = uctxt->dd; + unsigned int rcvctrl_op; + + if (subctxt) + goto bail; + /* atomically clear receive enable ctxt. */ + if (start_stop) { + /* + * On enable, force in-memory copy of the tail register to + * 0, so that protocol code doesn't have to worry about + * whether or not the chip has yet updated the in-memory + * copy or not on return from the system call. The chip + * always resets it's tail register back to 0 on a + * transition from disabled to enabled. + */ + if (uctxt->rcvhdrtail_kvaddr) + clear_rcvhdrtail(uctxt); + rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB; + } else { + rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS; + } + hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt); + /* always; new head should be equal to new tail; see above */ +bail: + return 0; +} + +/* + * clear the event notifier events for this context. + * User process then performs actions appropriate to bit having been + * set, if desired, and checks again in future. + */ +static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt, + unsigned long events) +{ + int i; + struct hfi1_devdata *dd = uctxt->dd; + unsigned long *evs; + + if (!dd->events) + return 0; + + evs = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + subctxt; + + for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) { + if (!test_bit(i, &events)) + continue; + clear_bit(i, evs); + } + return 0; +} + +static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt, + u16 pkey) +{ + int ret = -ENOENT, i, intable = 0; + struct hfi1_pportdata *ppd = uctxt->ppd; + struct hfi1_devdata *dd = uctxt->dd; + + if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) { + ret = -EINVAL; + goto done; + } + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) + if (pkey == ppd->pkeys[i]) { + intable = 1; + break; + } + + if (intable) + ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey); +done: + return ret; +} + +static void user_remove(struct hfi1_devdata *dd) +{ + + hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device); +} + +static int user_add(struct hfi1_devdata *dd) +{ + char name[10]; + int ret; + + snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit); + ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops, + &dd->user_cdev, &dd->user_device, + true, &dd->kobj); + if (ret) + user_remove(dd); + + return ret; +} + +/* + * Create per-unit files in /dev + */ +int hfi1_device_create(struct hfi1_devdata *dd) +{ + int r, ret; + + r = user_add(dd); + ret = hfi1_diag_add(dd); + if (r && !ret) + ret = r; + return ret; +} + +/* + * Remove per-unit files in /dev + * void, core kernel returns no errors for this stuff + */ +void hfi1_device_remove(struct hfi1_devdata *dd) +{ + user_remove(dd); + hfi1_diag_remove(dd); +} diff --git a/kernel/include/hfi1/common.h b/kernel/include/hfi1/common.h new file mode 100644 index 00000000..2d22d906 --- /dev/null +++ b/kernel/include/hfi1/common.h @@ -0,0 +1,407 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _COMMON_H +#define _COMMON_H + +#include "update/hfi1_user.h" + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + +/* version of protocol header (known to chip also). In the long run, + * we should be able to generate and accept a range of version numbers; + * for now we only accept one, and it's compiled in. + */ +#define IPS_PROTO_VERSION 2 + +/* + * These are compile time constants that you may want to enable or disable + * if you are trying to debug problems with code or performance. + * HFI1_VERBOSE_TRACING define as 1 if you want additional tracing in + * fast path code + * HFI1_TRACE_REGWRITES define as 1 if you want register writes to be + * traced in fast path code + * _HFI1_TRACING define as 0 if you want to remove all tracing in a + * compilation unit + */ + +/* + * If a packet's QP[23:16] bits match this value, then it is + * a PSM packet and the hardware will expect a KDETH header + * following the BTH. + */ +#define DEFAULT_KDETH_QP 0x80 + +/* driver/hw feature set bitmask */ +#define HFI1_CAP_USER_SHIFT 24 +#define HFI1_CAP_MASK ((1UL << HFI1_CAP_USER_SHIFT) - 1) +/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */ +#define HFI1_CAP_LOCKED_SHIFT 63 +#define HFI1_CAP_LOCKED_MASK 0x1ULL +#define HFI1_CAP_LOCKED_SMASK (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT) +/* extra bits used between kernel and user processes */ +#define HFI1_CAP_MISC_SHIFT (HFI1_CAP_USER_SHIFT * 2) +#define HFI1_CAP_MISC_MASK ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \ + HFI1_CAP_MISC_SHIFT)) - 1) + +#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; }) +#define HFI1_CAP_KCLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~HFI1_CAP_##cap; \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_USET(cap) \ + ({ \ + hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_UCLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_SET(cap) \ + ({ \ + hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap << \ + HFI1_CAP_USER_SHIFT)); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_CLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~(HFI1_CAP_##cap | \ + (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_LOCK() \ + ({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; }) +#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK)) +/* + * The set of capability bits that can be changed after initial load + * This set is the same for kernel and user contexts. However, for + * user contexts, the set can be further filtered by using the + * HFI1_CAP_RESERVED_MASK bits. + */ +#define HFI1_CAP_WRITABLE_MASK (HFI1_CAP_SDMA_AHG | \ + HFI1_CAP_HDRSUPP | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_NODROP_RHQ_FULL | \ + HFI1_CAP_NODROP_EGR_FULL | \ + HFI1_CAP_ALLOW_PERM_JKEY | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_PRINT_UNIMPL | \ + HFI1_CAP_TID_UNMAP | \ + HFI1_CAP_OPFN | \ + HFI1_CAP_TID_RDMA) +/* + * A set of capability bits that are "global" and are not allowed to be + * set in the user bitmask. + */ +#define HFI1_CAP_RESERVED_MASK ((HFI1_CAP_SDMA | \ + HFI1_CAP_USE_SDMA_HEAD | \ + HFI1_CAP_EXTENDED_PSN | \ + HFI1_CAP_PRINT_UNIMPL | \ + HFI1_CAP_NO_INTEGRITY | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_TID_RDMA | \ + HFI1_CAP_OPFN) << \ + HFI1_CAP_USER_SHIFT) +/* + * Set of capabilities that need to be enabled for kernel context in + * order to be allowed for user contexts, as well. + */ +#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL) +/* Default enabled capabilities (both kernel and user) */ +#define HFI1_CAP_MASK_DEFAULT (HFI1_CAP_HDRSUPP | \ + HFI1_CAP_NODROP_RHQ_FULL | \ + HFI1_CAP_NODROP_EGR_FULL | \ + HFI1_CAP_SDMA | \ + HFI1_CAP_PRINT_UNIMPL | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_EXTENDED_PSN | \ + ((HFI1_CAP_HDRSUPP | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_EARLY_CREDIT_RETURN) << \ + HFI1_CAP_USER_SHIFT)) +/* + * A bitmask of kernel/global capabilities that should be communicated + * to user level processes. + */ +#define HFI1_CAP_K2U (HFI1_CAP_SDMA | \ + HFI1_CAP_EXTENDED_PSN | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_NO_INTEGRITY) + +#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) | \ + HFI1_USER_SWMINOR) + +#ifndef HFI1_KERN_TYPE +#define HFI1_KERN_TYPE 0 +#endif + +/* + * Similarly, this is the kernel version going back to the user. It's + * slightly different, in that we want to tell if the driver was built as + * part of a Intel release, or from the driver from openfabrics.org, + * kernel.org, or a standard distribution, for support reasons. + * The high bit is 0 for non-Intel and 1 for Intel-built/supplied. + * + * It's returned by the driver to the user code during initialization in the + * spi_sw_version field of hfi1_base_info, so the user code can in turn + * check for compatibility with the kernel. +*/ +#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION) + +/* + * Define the driver version number. This is something that refers only + * to the driver itself, not the software interfaces it supports. + */ +#ifndef HFI1_DRIVER_VERSION_BASE +#define HFI1_DRIVER_VERSION_BASE "0.9-294" +#endif + +/* create the final driver version string */ +#ifdef HFI1_IDSTR +#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR +#else +#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE +#endif + +/* + * Diagnostics can send a packet by writing the following + * struct to the diag packet special file. + * + * This allows a custom PBC qword, so that special modes and deliberate + * changes to CRCs can be used. + */ +#define _DIAG_PKT_VERS 1 +struct diag_pkt { + __u16 version; /* structure version */ + __u16 unit; /* which device */ + __u16 sw_index; /* send sw index to use */ + __u16 len; /* data length, in bytes */ + __u16 port; /* port number */ + __u16 unused; + __u32 flags; /* call flags */ + __u64 data; /* user data pointer */ + __u64 pbc; /* PBC for the packet */ +}; + +/* diag_pkt flags */ +#define F_DIAGPKT_WAIT 0x1 /* wait until packet is sent */ + +/* + * The next set of defines are for packet headers, and chip register + * and memory bits that are visible to and/or used by user-mode software. + */ + +/* + * Receive Header Flags + */ +#define RHF_PKT_LEN_SHIFT 0 +#define RHF_PKT_LEN_MASK 0xfffull +#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT) + +#define RHF_RCV_TYPE_SHIFT 12 +#define RHF_RCV_TYPE_MASK 0x7ull +#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT) + +#define RHF_USE_EGR_BFR_SHIFT 15 +#define RHF_USE_EGR_BFR_MASK 0x1ull +#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT) + +#define RHF_EGR_INDEX_SHIFT 16 +#define RHF_EGR_INDEX_MASK 0x7ffull +#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT) + +#define RHF_DC_INFO_SHIFT 27 +#define RHF_DC_INFO_MASK 0x1ull +#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT) + +#define RHF_RCV_SEQ_SHIFT 28 +#define RHF_RCV_SEQ_MASK 0xfull +#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT) + +#define RHF_EGR_OFFSET_SHIFT 32 +#define RHF_EGR_OFFSET_MASK 0xfffull +#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT) +#define RHF_HDRQ_OFFSET_SHIFT 44 +#define RHF_HDRQ_OFFSET_MASK 0x1ffull +#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT) +#define RHF_K_HDR_LEN_ERR (0x1ull << 53) +#define RHF_DC_UNC_ERR (0x1ull << 54) +#define RHF_DC_ERR (0x1ull << 55) +#define RHF_RCV_TYPE_ERR_SHIFT 56 +#define RHF_RCV_TYPE_ERR_MASK 0x7ul +#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT) +#define RHF_TID_ERR (0x1ull << 59) +#define RHF_LEN_ERR (0x1ull << 60) +#define RHF_ECC_ERR (0x1ull << 61) +#define RHF_VCRC_ERR (0x1ull << 62) +#define RHF_ICRC_ERR (0x1ull << 63) + +#define RHF_ERROR_SMASK 0xffe0000000000000ull /* bits 63:53 */ + +/* RHF receive types */ +#define RHF_RCV_TYPE_EXPECTED 0 +#define RHF_RCV_TYPE_EAGER 1 +#define RHF_RCV_TYPE_IB 2 /* normal IB, IB Raw, or IPv6 */ +#define RHF_RCV_TYPE_ERROR 3 +#define RHF_RCV_TYPE_BYPASS 4 +#define RHF_RCV_TYPE_INVALID5 5 +#define RHF_RCV_TYPE_INVALID6 6 +#define RHF_RCV_TYPE_INVALID7 7 + +/* RHF receive type error - expected packet errors */ +#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR 0x2 +#define RHF_RTE_EXPECTED_FLOW_GEN_ERR 0x4 + +/* RHF receive type error - eager packet errors */ +#define RHF_RTE_EAGER_NO_ERR 0x0 + +/* RHF receive type error - IB packet errors */ +#define RHF_RTE_IB_NO_ERR 0x0 + +/* RHF receive type error - error packet errors */ +#define RHF_RTE_ERROR_NO_ERR 0x0 +#define RHF_RTE_ERROR_OP_CODE_ERR 0x1 +#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR 0x2 +#define RHF_RTE_ERROR_KHDR_HCRC_ERR 0x3 +#define RHF_RTE_ERROR_KHDR_KVER_ERR 0x4 +#define RHF_RTE_ERROR_CONTEXT_ERR 0x5 +#define RHF_RTE_ERROR_KHDR_TID_ERR 0x6 + +/* RHF receive type error - bypass packet errors */ +#define RHF_RTE_BYPASS_NO_ERR 0x0 + +/* IB - LRH header constants */ +#define HFI1_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */ +#define HFI1_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ + +/* misc. */ +#define SIZE_OF_CRC 1 + +#define LIM_MGMT_P_KEY 0x7FFF +#define FULL_MGMT_P_KEY 0xFFFF + +#define DEFAULT_P_KEY LIM_MGMT_P_KEY +#define HFI1_FECN_SHIFT 31 +#define HFI1_FECN_MASK 1 +#define HFI1_FECN_SMASK BIT(HFI1_FECN_SHIFT) +#define HFI1_BECN_SHIFT 30 +#define HFI1_BECN_MASK 1 +#define HFI1_BECN_SMASK BIT(HFI1_BECN_SHIFT) + +#define HFI1_PSM_IOC_BASE_SEQ 0x0 + +/* Number of BTH.PSN bits used for sequence number in expected rcvs */ +#define HFI1_KDETH_BTH_SEQ_SHIFT 11 +#define HFI1_KDETH_BTH_SEQ_MASK (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1) + +static inline __u64 rhf_to_cpu(const __le32 *rbuf) +{ + return __le64_to_cpu(*((__le64 *)rbuf)); +} + +static inline u64 rhf_err_flags(u64 rhf) +{ + return rhf & RHF_ERROR_SMASK; +} + +static inline u32 rhf_rcv_type(u64 rhf) +{ + return (rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK; +} + +static inline u32 rhf_rcv_type_err(u64 rhf) +{ + return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK; +} + +/* return size is in bytes, not DWORDs */ +static inline u32 rhf_pkt_len(u64 rhf) +{ + return ((rhf & RHF_PKT_LEN_SMASK) >> RHF_PKT_LEN_SHIFT) << 2; +} + +static inline u32 rhf_egr_index(u64 rhf) +{ + return (rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK; +} + +static inline u32 rhf_rcv_seq(u64 rhf) +{ + return (rhf >> RHF_RCV_SEQ_SHIFT) & RHF_RCV_SEQ_MASK; +} + +/* returned offset is in DWORDS */ +static inline u32 rhf_hdrq_offset(u64 rhf) +{ + return (rhf >> RHF_HDRQ_OFFSET_SHIFT) & RHF_HDRQ_OFFSET_MASK; +} + +static inline u64 rhf_use_egr_bfr(u64 rhf) +{ + return rhf & RHF_USE_EGR_BFR_SMASK; +} + +static inline u64 rhf_dc_info(u64 rhf) +{ + return rhf & RHF_DC_INFO_SMASK; +} + +static inline u32 rhf_egr_buf_offset(u64 rhf) +{ + return (rhf >> RHF_EGR_OFFSET_SHIFT) & RHF_EGR_OFFSET_MASK; +} +#endif /* _COMMON_H */ diff --git a/kernel/include/hfi1/hfi.h b/kernel/include/hfi1/hfi.h new file mode 100644 index 00000000..66cbd778 --- /dev/null +++ b/kernel/include/hfi1/hfi.h @@ -0,0 +1,2152 @@ +#ifndef _HFI1_KERNEL_H +#define _HFI1_KERNEL_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "chip_registers.h" +#include "common.h" +#include "opfn.h" +#include "verbs.h" +#include "pio.h" +#include "chip.h" +#include "mad.h" +#include "qsfp.h" +#include "platform.h" +#include "affinity.h" + +/* bumped 1 from s/w major version of TrueScale */ +#define HFI1_CHIP_VERS_MAJ 3U + +/* don't care about this except printing */ +#define HFI1_CHIP_VERS_MIN 0U + +/* The Organization Unique Identifier (Mfg code), and its position in GUID */ +#define HFI1_OUI 0x001175 +#define HFI1_OUI_LSB 40 + +#define DROP_PACKET_OFF 0 +#define DROP_PACKET_ON 1 + +extern unsigned long hfi1_cap_mask; +#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap) +#define HFI1_CAP_UGET_MASK(mask, cap) \ + (((mask) >> HFI1_CAP_USER_SHIFT) & HFI1_CAP_##cap) +#define HFI1_CAP_KGET(cap) (HFI1_CAP_KGET_MASK(hfi1_cap_mask, cap)) +#define HFI1_CAP_UGET(cap) (HFI1_CAP_UGET_MASK(hfi1_cap_mask, cap)) +#define HFI1_CAP_IS_KSET(cap) (!!HFI1_CAP_KGET(cap)) +#define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap)) +#define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \ + HFI1_CAP_MISC_MASK) +/* Offline Disabled Reason is 4-bits */ +#define HFI1_ODR_MASK(rsn) ((rsn) & OPA_PI_MASK_OFFLINE_REASON) + +/* + * Control context is always 0 and handles the error packets. + * It also handles the VL15 and multicast packets. + */ +#define HFI1_CTRL_CTXT 0 + +/* + * Driver context will store software counters for each of the events + * associated with these status registers + */ +#define NUM_CCE_ERR_STATUS_COUNTERS 41 +#define NUM_RCV_ERR_STATUS_COUNTERS 64 +#define NUM_MISC_ERR_STATUS_COUNTERS 13 +#define NUM_SEND_PIO_ERR_STATUS_COUNTERS 36 +#define NUM_SEND_DMA_ERR_STATUS_COUNTERS 4 +#define NUM_SEND_EGRESS_ERR_STATUS_COUNTERS 64 +#define NUM_SEND_ERR_STATUS_COUNTERS 3 +#define NUM_SEND_CTXT_ERR_STATUS_COUNTERS 5 +#define NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS 24 + +/* + * per driver stats, either not device nor port-specific, or + * summed over all of the devices and ports. + * They are described by name via ipathfs filesystem, so layout + * and number of elements can change without breaking compatibility. + * If members are added or deleted hfi1_statnames[] in debugfs.c must + * change to match. + */ +struct hfi1_ib_stats { + __u64 sps_ints; /* number of interrupts handled */ + __u64 sps_errints; /* number of error interrupts */ + __u64 sps_txerrs; /* tx-related packet errors */ + __u64 sps_rcverrs; /* non-crc rcv packet errors */ + __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */ + __u64 sps_nopiobufs; /* no pio bufs avail from kernel */ + __u64 sps_ctxts; /* number of contexts currently open */ + __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */ + __u64 sps_buffull; + __u64 sps_hdrfull; +}; + +extern struct hfi1_ib_stats hfi1_stats; +extern const struct pci_error_handlers hfi1_pci_err_handler; + +/* + * First-cut criterion for "device is active" is + * two thousand dwords combined Tx, Rx traffic per + * 5-second interval. SMA packets are 64 dwords, + * and occur "a few per second", presumably each way. + */ +#define HFI1_TRAFFIC_ACTIVE_THRESHOLD (2000) + +/* + * Below contains all data related to a single context (formerly called port). + */ + +#ifdef CONFIG_DEBUG_FS +struct hfi1_opcode_stats_perctx; +#endif + +struct ctxt_eager_bufs { + ssize_t size; /* total size of eager buffers */ + u32 count; /* size of buffers array */ + u32 numbufs; /* number of buffers allocated */ + u32 alloced; /* number of rcvarray entries used */ + u32 rcvtid_size; /* size of each eager rcv tid */ + u32 threshold; /* head update threshold */ + struct eager_buffer { + void *addr; + dma_addr_t dma; + ssize_t len; + } *buffers; + struct { + void *addr; + dma_addr_t dma; + } *rcvtids; +}; + +struct exp_tid_set { + struct list_head list; + u32 count; +}; + +struct tid_queue { + struct list_head queue_head; + /* queue head for QP TID resource waiters */ + u32 enqueue; /* count of tid enqueues */ + u32 dequeue; /* count of tid dequeues */ +}; + +struct hfi1_ctxtdata { + /* shadow the ctxt's RcvCtrl register */ + u64 rcvctrl; + /* rcvhdrq base, needs mmap before useful */ + void *rcvhdrq; + /* kernel virtual address where hdrqtail is updated */ + volatile __le64 *rcvhdrtail_kvaddr; + /* + * Shared page for kernel to signal user processes that send buffers + * need disarming. The process should call HFI1_CMD_DISARM_BUFS + * or HFI1_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set. + */ + unsigned long *user_event_mask; + /* when waiting for rcv or pioavail */ + wait_queue_head_t wait; + /* rcvhdrq size (for freeing) */ + size_t rcvhdrq_size; + /* number of rcvhdrq entries */ + u16 rcvhdrq_cnt; + /* size of each of the rcvhdrq entries */ + u16 rcvhdrqentsize; + /* mmap of hdrq, must fit in 44 bits */ + dma_addr_t rcvhdrq_dma; + dma_addr_t rcvhdrqtailaddr_dma; + struct ctxt_eager_bufs egrbufs; + /* this receive context's assigned PIO ACK send context */ + struct send_context *sc; + + /* dynamic receive available interrupt timeout */ + u32 rcvavail_timeout; + /* + * number of opens (including slave sub-contexts) on this instance + * (ignoring forks, dup, etc. for now) + */ + int cnt; + /* + * how much space to leave at start of eager TID entries for + * protocol use, on each TID + */ + /* instead of calculating it */ + unsigned ctxt; + /* non-zero if ctxt is being shared. */ + u16 subctxt_cnt; + /* non-zero if ctxt is being shared. */ + u16 subctxt_id; + u8 uuid[16]; + /* job key */ + u16 jkey; + /* number of RcvArray groups for this context. */ + u32 rcv_array_groups; + /* index of first eager TID entry. */ + u32 eager_base; + /* number of expected TID entries */ + u32 expected_count; + /* index of first expected TID entry. */ + u32 expected_base; + + struct exp_tid_set tid_group_list; + struct exp_tid_set tid_used_list; + struct exp_tid_set tid_full_list; + + /* lock protecting all Expected TID data of user contexts */ + struct mutex exp_mutex; + /* lock protecting all Expected TID data of kernel contexts */ + spinlock_t exp_lock; + /* Queue for QP's waiting for HW TID flows */ + struct tid_queue flow_queue; + /* Queue for QP's waiting for HW receive array entries */ + struct tid_queue rarr_queue; + + /* number of pio bufs for this ctxt (all procs, if shared) */ + u32 piocnt; + /* first pio buffer for this ctxt */ + u32 pio_base; + /* chip offset of PIO buffers for this ctxt */ + u32 piobufs; + /* per-context configuration flags */ + unsigned long flags; + /* per-context event flags for fileops/intr communication */ + unsigned long event_flags; + /* WAIT_RCV that timed out, no interrupt */ + u32 rcvwait_to; + /* WAIT_PIO that timed out, no interrupt */ + u32 piowait_to; + /* WAIT_RCV already happened, no wait */ + u32 rcvnowait; + /* WAIT_PIO already happened, no wait */ + u32 pionowait; + /* total number of polled urgent packets */ + u32 urgent; + /* saved total number of polled urgent packets for poll edge trigger */ + u32 urgent_poll; + /* same size as task_struct .comm[], command that opened context */ + char comm[TASK_COMM_LEN]; + /* so file ops can get at unit */ + struct hfi1_devdata *dd; + /* so functions that need physical port can get it easily */ + struct hfi1_pportdata *ppd; + /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */ + void *subctxt_uregbase; + /* An array of pages for the eager receive buffers * N */ + void *subctxt_rcvegrbuf; + /* An array of pages for the eager header queue entries * N */ + void *subctxt_rcvhdr_base; + /* The version of the library which opened this ctxt */ + u32 userversion; + /* Bitmask of active slaves */ + u32 active_slaves; + /* Type of packets or conditions we want to poll for */ + u16 poll_type; + /* receive packet sequence counter */ + u8 seq_cnt; + u8 redirect_seq_cnt; + /* ctxt rcvhdrq head offset */ + u32 head; + u32 pkt_count; + /* QPs waiting for context processing */ + struct list_head qp_wait_list; + /* interrupt handling */ + u64 imask; /* clear interrupt mask */ + int ireg; /* clear interrupt register */ + unsigned numa_id; /* numa node of this context */ + /* verbs stats per CTX */ + struct hfi1_opcode_stats_perctx *opstats; + /* + * This is the kernel thread that will keep making + * progress on the user sdma requests behind the scenes. + * There is one per context (shared contexts use the master's). + */ + struct task_struct *progress; + struct list_head sdma_queues; + /* protect sdma queues */ + spinlock_t sdma_qlock; + + /* Is ASPM interrupt supported for this context */ + bool aspm_intr_supported; + /* ASPM state (enabled/disabled) for this context */ + bool aspm_enabled; + /* Timer for re-enabling ASPM if interrupt activity quietens down */ + struct timer_list aspm_timer; + /* Lock to serialize between intr, timer intr and user threads */ + spinlock_t aspm_lock; + /* Is ASPM processing enabled for this context (in intr context) */ + bool aspm_intr_enable; + /* Last interrupt timestamp */ + ktime_t aspm_ts_last_intr; + /* Last timestamp at which we scheduled a timer for this context */ + ktime_t aspm_ts_timer_sched; + + /* Bit mask to track free TID RDMA HW flows */ + unsigned long flow_mask; + struct tid_flow_state flows[RXE_NUM_TID_FLOWS]; + + /* + * The interrupt handler for a particular receive context can vary + * throughout it's lifetime. This is not a lock protected data member so + * it must be updated atomically and the prev and new value must always + * be valid. Worst case is we process an extra interrupt and up to 64 + * packets with the wrong interrupt handler. + */ + int (*do_interrupt)(struct hfi1_ctxtdata *rcd, int threaded); +}; + +/* + * Represents a single packet at a high level. Put commonly computed things in + * here so we do not have to keep doing them over and over. The rule of thumb is + * if something is used one time to derive some value, store that something in + * here. If it is used multiple times, then store the result of that derivation + * in here. + */ +struct hfi1_packet { + void *ebuf; + void *hdr; + struct hfi1_ctxtdata *rcd; + __le32 *rhf_addr; + struct rvt_qp *qp; + struct ib_other_headers *ohdr; + u64 rhf; + u32 maxcnt; + u32 rhqoff; + u16 tlen; + s16 etail; + u8 hlen; + u8 numpkt; + u8 rsize; + u8 updegr; + u8 rcv_flags; + u8 etype; +}; + +/* + * Private data for snoop/capture support. + */ +struct hfi1_snoop_data { + int mode_flag; + struct cdev cdev; + struct device *class_dev; + /* protect snoop data */ + spinlock_t snoop_lock; + struct list_head queue; + wait_queue_head_t waitq; + void *filter_value; + int (*filter_callback)(void *hdr, void *data, void *value); + u64 dcc_cfg; /* saved value of DCC Cfg register */ +}; + +/* snoop mode_flag values */ +#define HFI1_PORT_SNOOP_MODE 1U +#define HFI1_PORT_CAPTURE_MODE 2U + +struct rvt_sge_state; + +/* + * Get/Set IB link-level config parameters for f_get/set_ib_cfg() + * Mostly for MADs that set or query link parameters, also ipath + * config interfaces + */ +#define HFI1_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */ +#define HFI1_IB_CFG_LWID_DG_ENB 1 /* allowed Link-width downgrade */ +#define HFI1_IB_CFG_LWID_ENB 2 /* allowed Link-width */ +#define HFI1_IB_CFG_LWID 3 /* currently active Link-width */ +#define HFI1_IB_CFG_SPD_ENB 4 /* allowed Link speeds */ +#define HFI1_IB_CFG_SPD 5 /* current Link spd */ +#define HFI1_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */ +#define HFI1_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */ +#define HFI1_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */ +#define HFI1_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */ +#define HFI1_IB_CFG_OP_VLS 10 /* operational VLs */ +#define HFI1_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */ +#define HFI1_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */ +#define HFI1_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */ +#define HFI1_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */ +#define HFI1_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */ +#define HFI1_IB_CFG_PKEYS 16 /* update partition keys */ +#define HFI1_IB_CFG_MTU 17 /* update MTU in IBC */ +#define HFI1_IB_CFG_VL_HIGH_LIMIT 19 +#define HFI1_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */ +#define HFI1_IB_CFG_PORT 21 /* switch port we are connected to */ + +/* + * HFI or Host Link States + * + * These describe the states the driver thinks the logical and physical + * states are in. Used as an argument to set_link_state(). Implemented + * as bits for easy multi-state checking. The actual state can only be + * one. + */ +#define __HLS_UP_INIT_BP 0 +#define __HLS_UP_ARMED_BP 1 +#define __HLS_UP_ACTIVE_BP 2 +#define __HLS_DN_DOWNDEF_BP 3 /* link down default */ +#define __HLS_DN_POLL_BP 4 +#define __HLS_DN_DISABLE_BP 5 +#define __HLS_DN_OFFLINE_BP 6 +#define __HLS_VERIFY_CAP_BP 7 +#define __HLS_GOING_UP_BP 8 +#define __HLS_GOING_OFFLINE_BP 9 +#define __HLS_LINK_COOLDOWN_BP 10 + +#define HLS_UP_INIT BIT(__HLS_UP_INIT_BP) +#define HLS_UP_ARMED BIT(__HLS_UP_ARMED_BP) +#define HLS_UP_ACTIVE BIT(__HLS_UP_ACTIVE_BP) +#define HLS_DN_DOWNDEF BIT(__HLS_DN_DOWNDEF_BP) /* link down default */ +#define HLS_DN_POLL BIT(__HLS_DN_POLL_BP) +#define HLS_DN_DISABLE BIT(__HLS_DN_DISABLE_BP) +#define HLS_DN_OFFLINE BIT(__HLS_DN_OFFLINE_BP) +#define HLS_VERIFY_CAP BIT(__HLS_VERIFY_CAP_BP) +#define HLS_GOING_UP BIT(__HLS_GOING_UP_BP) +#define HLS_GOING_OFFLINE BIT(__HLS_GOING_OFFLINE_BP) +#define HLS_LINK_COOLDOWN BIT(__HLS_LINK_COOLDOWN_BP) + +#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE) +#define HLS_DOWN ~(HLS_UP) + +/* use this MTU size if none other is given */ +#define HFI1_DEFAULT_ACTIVE_MTU 10240 +/* use this MTU size as the default maximum */ +#define HFI1_DEFAULT_MAX_MTU 10240 +/* default partition key */ +#define DEFAULT_PKEY 0xffff + +/* + * Possible fabric manager config parameters for fm_{get,set}_table() + */ +#define FM_TBL_VL_HIGH_ARB 1 /* Get/set VL high prio weights */ +#define FM_TBL_VL_LOW_ARB 2 /* Get/set VL low prio weights */ +#define FM_TBL_BUFFER_CONTROL 3 /* Get/set Buffer Control */ +#define FM_TBL_SC2VLNT 4 /* Get/set SC->VLnt */ +#define FM_TBL_VL_PREEMPT_ELEMS 5 /* Get (no set) VL preempt elems */ +#define FM_TBL_VL_PREEMPT_MATRIX 6 /* Get (no set) VL preempt matrix */ + +/* + * Possible "operations" for f_rcvctrl(ppd, op, ctxt) + * these are bits so they can be combined, e.g. + * HFI1_RCVCTRL_INTRAVAIL_ENB | HFI1_RCVCTRL_CTXT_ENB + */ +#define HFI1_RCVCTRL_TAILUPD_ENB 0x01 +#define HFI1_RCVCTRL_TAILUPD_DIS 0x02 +#define HFI1_RCVCTRL_CTXT_ENB 0x04 +#define HFI1_RCVCTRL_CTXT_DIS 0x08 +#define HFI1_RCVCTRL_INTRAVAIL_ENB 0x10 +#define HFI1_RCVCTRL_INTRAVAIL_DIS 0x20 +#define HFI1_RCVCTRL_PKEY_ENB 0x40 /* Note, default is enabled */ +#define HFI1_RCVCTRL_PKEY_DIS 0x80 +#define HFI1_RCVCTRL_TIDFLOW_ENB 0x0400 +#define HFI1_RCVCTRL_TIDFLOW_DIS 0x0800 +#define HFI1_RCVCTRL_ONE_PKT_EGR_ENB 0x1000 +#define HFI1_RCVCTRL_ONE_PKT_EGR_DIS 0x2000 +#define HFI1_RCVCTRL_NO_RHQ_DROP_ENB 0x4000 +#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000 +#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000 +#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000 + +/* partition enforcement flags */ +#define HFI1_PART_ENFORCE_IN 0x1 +#define HFI1_PART_ENFORCE_OUT 0x2 + +/* how often we check for synthetic counter wrap around */ +#define SYNTH_CNT_TIME 2 + +/* Counter flags */ +#define CNTR_NORMAL 0x0 /* Normal counters, just read register */ +#define CNTR_SYNTH 0x1 /* Synthetic counters, saturate at all 1s */ +#define CNTR_DISABLED 0x2 /* Disable this counter */ +#define CNTR_32BIT 0x4 /* Simulate 64 bits for this counter */ +#define CNTR_VL 0x8 /* Per VL counter */ +#define CNTR_SDMA 0x10 +#define CNTR_INVALID_VL -1 /* Specifies invalid VL */ +#define CNTR_MODE_W 0x0 +#define CNTR_MODE_R 0x1 + +/* VLs Supported/Operational */ +#define HFI1_MIN_VLS_SUPPORTED 1 +#define HFI1_MAX_VLS_SUPPORTED 8 + +#define HFI1_GUIDS_PER_PORT 5 +#define HFI1_PORT_GUID_INDEX 0 + +static inline void incr_cntr64(u64 *cntr) +{ + if (*cntr < (u64)-1LL) + (*cntr)++; +} + +static inline void incr_cntr32(u32 *cntr) +{ + if (*cntr < (u32)-1LL) + (*cntr)++; +} + +#define MAX_NAME_SIZE 64 +struct hfi1_msix_entry { + enum irq_type type; + struct msix_entry msix; + void *arg; + char name[MAX_NAME_SIZE]; + cpumask_t mask; + struct irq_affinity_notify notify; +}; + +/* per-SL CCA information */ +struct cca_timer { + struct hrtimer hrtimer; + struct hfi1_pportdata *ppd; /* read-only */ + int sl; /* read-only */ + u16 ccti; /* read/write - current value of CCTI */ +}; + +struct link_down_reason { + /* + * SMA-facing value. Should be set from .latest when + * HLS_UP_* -> HLS_DN_* transition actually occurs. + */ + u8 sma; + u8 latest; +}; + +enum { + LO_PRIO_TABLE, + HI_PRIO_TABLE, + MAX_PRIO_TABLE +}; + +struct vl_arb_cache { + /* protect vl arb cache */ + spinlock_t lock; + struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE]; +}; + +/* + * The structure below encapsulates data relevant to a physical IB Port. + * Current chips support only one such port, but the separation + * clarifies things a bit. Note that to conform to IB conventions, + * port-numbers are one-based. The first or only port is port1. + */ +struct hfi1_pportdata { + struct hfi1_ibport ibport_data; + + struct hfi1_devdata *dd; + struct kobject pport_cc_kobj; + struct kobject sc2vl_kobj; + struct kobject sl2sc_kobj; + struct kobject vl2mtu_kobj; + + /* PHY support */ + struct qsfp_data qsfp_info; + /* Values for SI tuning of SerDes */ + u32 port_type; + u32 tx_preset_eq; + u32 tx_preset_noeq; + u32 rx_preset; + u8 local_atten; + u8 remote_atten; + u8 default_atten; + u8 max_power_class; + + /* GUIDs for this interface, in host order, guids[0] is a port guid */ + u64 guids[HFI1_GUIDS_PER_PORT]; + + /* GUID for peer interface, in host order */ + u64 neighbor_guid; + + /* up or down physical link state */ + u32 linkup; + + /* + * this address is mapped read-only into user processes so they can + * get status cheaply, whenever they want. One qword of status per port + */ + u64 *statusp; + + /* SendDMA related entries */ + + struct workqueue_struct *hfi1_wq; + + /* move out of interrupt context */ + struct work_struct link_vc_work; + struct work_struct link_up_work; + struct work_struct link_down_work; + struct work_struct sma_message_work; + struct work_struct freeze_work; + struct work_struct link_downgrade_work; + struct work_struct link_bounce_work; + struct delayed_work start_link_work; + /* host link state variables */ + struct mutex hls_lock; + u32 host_link_state; + + u32 lstate; /* logical link state */ + + /* these are the "32 bit" regs */ + + u32 ibmtu; /* The MTU programmed for this unit */ + /* + * Current max size IB packet (in bytes) including IB headers, that + * we can send. Changes when ibmtu changes. + */ + u32 ibmaxlen; + u32 current_egress_rate; /* units [10^6 bits/sec] */ + /* LID programmed for this instance */ + u16 lid; + /* list of pkeys programmed; 0 if not set */ + u16 pkeys[MAX_PKEY_VALUES]; + u16 link_width_supported; + u16 link_width_downgrade_supported; + u16 link_speed_supported; + u16 link_width_enabled; + u16 link_width_downgrade_enabled; + u16 link_speed_enabled; + u16 link_width_active; + u16 link_width_downgrade_tx_active; + u16 link_width_downgrade_rx_active; + u16 link_speed_active; + u8 vls_supported; + u8 vls_operational; + u8 actual_vls_operational; + /* LID mask control */ + u8 lmc; + /* Rx Polarity inversion (compensate for ~tx on partner) */ + u8 rx_pol_inv; + + u8 hw_pidx; /* physical port index */ + u8 port; /* IB port number and index into dd->pports - 1 */ + /* type of neighbor node */ + u8 neighbor_type; + u8 neighbor_normal; + u8 neighbor_fm_security; /* 1 if firmware checking is disabled */ + u8 neighbor_port_number; + u8 is_sm_config_started; + u8 offline_disabled_reason; + u8 is_active_optimize_enabled; + u8 driver_link_ready; /* driver ready for active link */ + u8 link_enabled; /* link enabled? */ + u8 linkinit_reason; + u8 local_tx_rate; /* rate given to 8051 firmware */ + u8 last_pstate; /* info only */ + u8 qsfp_retry_count; + + /* placeholders for IB MAD packet settings */ + u8 overrun_threshold; + u8 phy_error_threshold; + + /* Used to override LED behavior for things like maintenance beaconing*/ + /* + * Alternates per phase of blink + * [0] holds LED off duration, [1] holds LED on duration + */ + unsigned long led_override_vals[2]; + u8 led_override_phase; /* LSB picks from vals[] */ + atomic_t led_override_timer_active; + /* Used to flash LEDs in override mode */ + struct timer_list led_override_timer; + + u32 sm_trap_qp; + u32 sa_qp; + + /* + * cca_timer_lock protects access to the per-SL cca_timer + * structures (specifically the ccti member). + */ + spinlock_t cca_timer_lock ____cacheline_aligned_in_smp; + struct cca_timer cca_timer[OPA_MAX_SLS]; + + /* List of congestion control table entries */ + struct ib_cc_table_entry_shadow ccti_entries[CC_TABLE_SHADOW_MAX]; + + /* congestion entries, each entry corresponding to a SL */ + struct opa_congestion_setting_entry_shadow + congestion_entries[OPA_MAX_SLS]; + + /* + * cc_state_lock protects (write) access to the per-port + * struct cc_state. + */ + spinlock_t cc_state_lock ____cacheline_aligned_in_smp; + + struct cc_state __rcu *cc_state; + + /* Total number of congestion control table entries */ + u16 total_cct_entry; + + /* Bit map identifying service level */ + u32 cc_sl_control_map; + + /* CA's max number of 64 entry units in the congestion control table */ + u8 cc_max_table_entries; + + /* + * begin congestion log related entries + * cc_log_lock protects all congestion log related data + */ + spinlock_t cc_log_lock ____cacheline_aligned_in_smp; + u8 threshold_cong_event_map[OPA_MAX_SLS / 8]; + u16 threshold_event_counter; + struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS]; + int cc_log_idx; /* index for logging events */ + int cc_mad_idx; /* index for reporting events */ + /* end congestion log related entries */ + + struct vl_arb_cache vl_arb_cache[MAX_PRIO_TABLE]; + + /* port relative counter buffer */ + u64 *cntrs; + /* port relative synthetic counter buffer */ + u64 *scntrs; + /* port_xmit_discards are synthesized from different egress errors */ + u64 port_xmit_discards; + u64 port_xmit_discards_vl[C_VL_COUNT]; + u64 port_xmit_constraint_errors; + u64 port_rcv_constraint_errors; + /* count of 'link_err' interrupts from DC */ + u64 link_downed; + /* number of times link retrained successfully */ + u64 link_up; + /* number of times a link unknown frame was reported */ + u64 unknown_frame_count; + /* port_ltp_crc_mode is returned in 'portinfo' MADs */ + u16 port_ltp_crc_mode; + /* port_crc_mode_enabled is the crc we support */ + u8 port_crc_mode_enabled; + /* mgmt_allowed is also returned in 'portinfo' MADs */ + u8 mgmt_allowed; + u8 part_enforce; /* partition enforcement flags */ + struct link_down_reason local_link_down_reason; + struct link_down_reason neigh_link_down_reason; + /* Value to be sent to link peer on LinkDown .*/ + u8 remote_link_down_reason; + /* Error events that will cause a port bounce. */ + u32 port_error_action; + struct work_struct linkstate_active_work; + /* Does this port need to prescan for FECNs */ + bool cc_prescan; +}; + +typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet); + +typedef void (*opcode_handler)(struct hfi1_packet *packet); + +/* return values for the RHF receive functions */ +#define RHF_RCV_CONTINUE 0 /* keep going */ +#define RHF_RCV_DONE 1 /* stop, this packet processed */ +#define RHF_RCV_REPROCESS 2 /* stop. retain this packet */ + +struct rcv_array_data { + u8 group_size; + u16 ngroups; + u16 nctxt_extra; +}; + +struct per_vl_data { + u16 mtu; + struct send_context *sc; +}; + +/* 16 to directly index */ +#define PER_VL_SEND_CONTEXTS 16 + +struct err_info_rcvport { + u8 status_and_code; + u64 packet_flit1; + u64 packet_flit2; +}; + +struct err_info_constraint { + u8 status; + u16 pkey; + u32 slid; +}; + +struct hfi1_temp { + unsigned int curr; /* current temperature */ + unsigned int lo_lim; /* low temperature limit */ + unsigned int hi_lim; /* high temperature limit */ + unsigned int crit_lim; /* critical temperature limit */ + u8 triggers; /* temperature triggers */ +}; + +struct hfi1_i2c_bus { + struct hfi1_devdata *controlling_dd; /* current controlling device */ + struct i2c_adapter adapter; /* bus details */ + struct i2c_algo_bit_data algo; /* bus algorithm details */ + int num; /* bus number, 0 or 1 */ +}; + +/* common data between shared ASIC HFIs */ +struct hfi1_asic_data { + struct hfi1_devdata *dds[2]; /* back pointers */ + struct mutex asic_resource_mutex; + struct hfi1_i2c_bus *i2c_bus0; + struct hfi1_i2c_bus *i2c_bus1; +}; + +/* device data struct now contains only "general per-device" info. + * fields related to a physical IB port are in a hfi1_pportdata struct. + */ +struct sdma_engine; +struct sdma_vl_map; + +#define BOARD_VERS_MAX 96 /* how long the version string can be */ +#define SERIAL_MAX 16 /* length of the serial number */ + +typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64); +struct hfi1_devdata { + struct hfi1_ibdev verbs_dev; /* must be first */ + struct list_head list; + /* pointers to related structs for this device */ + /* pci access data structure */ + struct pci_dev *pcidev; + struct cdev user_cdev; + struct cdev diag_cdev; + struct cdev ui_cdev; + struct device *user_device; + struct device *diag_device; + struct device *ui_device; + + /* mem-mapped pointer to base of chip regs */ + u8 __iomem *kregbase; + /* end of mem-mapped chip space excluding sendbuf and user regs */ + u8 __iomem *kregend; + /* physical address of chip for io_remap, etc. */ + resource_size_t physaddr; + /* Per VL data. Enough for all VLs but not all elements are set/used. */ + struct per_vl_data vld[PER_VL_SEND_CONTEXTS]; + /* send context data */ + struct send_context_info *send_contexts; + /* map hardware send contexts to software index */ + u8 *hw_to_sw; + /* spinlock for allocating and releasing send context resources */ + spinlock_t sc_lock; + /* lock for pio_map */ + spinlock_t pio_map_lock; + /* Send Context initialization lock. */ + spinlock_t sc_init_lock; + /* lock for sdma_map */ + spinlock_t sde_map_lock; + /* array of kernel send contexts */ + struct send_context **kernel_send_context; + /* array of vl maps */ + struct pio_vl_map __rcu *pio_map; + /* default flags to last descriptor */ + u64 default_desc1; + + /* fields common to all SDMA engines */ + + volatile __le64 *sdma_heads_dma; /* DMA'ed by chip */ + dma_addr_t sdma_heads_phys; + void *sdma_pad_dma; /* DMA'ed by chip */ + dma_addr_t sdma_pad_phys; + /* for deallocation */ + size_t sdma_heads_size; + /* number from the chip */ + u32 chip_sdma_engines; + /* num used */ + u32 num_sdma; + /* array of engines sized by num_sdma */ + struct sdma_engine *per_sdma; + /* array of vl maps */ + struct sdma_vl_map __rcu *sdma_map; + /* SPC freeze waitqueue and variable */ + wait_queue_head_t sdma_unfreeze_wq; + atomic_t sdma_unfreeze_count; + + u32 lcb_access_count; /* count of LCB users */ + + /* common data between shared ASIC HFIs in this OS */ + struct hfi1_asic_data *asic_data; + + /* mem-mapped pointer to base of PIO buffers */ + void __iomem *piobase; + /* + * write-combining mem-mapped pointer to base of RcvArray + * memory. + */ + void __iomem *rcvarray_wc; + /* + * credit return base - a per-NUMA range of DMA address that + * the chip will use to update the per-context free counter + */ + struct credit_return_base *cr_base; + + /* send context numbers and sizes for each type */ + struct sc_config_sizes sc_sizes[SC_MAX]; + + char *boardname; /* human readable board info */ + + /* reset value */ + u64 z_int_counter; + u64 z_rcv_limit; + u64 z_send_schedule; + + u64 __percpu *send_schedule; + /* number of receive contexts in use by the driver */ + u32 num_rcv_contexts; + /* number of pio send contexts in use by the driver */ + u32 num_send_contexts; + /* + * number of ctxts available for PSM open + */ + u32 freectxts; + /* total number of available user/PSM contexts */ + u32 num_user_contexts; + /* base receive interrupt timeout, in CSR units */ + u32 rcv_intr_timeout_csr; + + u32 freezelen; /* max length of freezemsg */ + u64 __iomem *egrtidbase; + spinlock_t sendctrl_lock; /* protect changes to SendCtrl */ + spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */ + /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */ + spinlock_t uctxt_lock; /* rcd and user context changes */ + /* exclusive access to 8051 */ + spinlock_t dc8051_lock; + /* exclusive access to 8051 memory */ + spinlock_t dc8051_memlock; + int dc8051_timed_out; /* remember if the 8051 timed out */ + /* + * A page that will hold event notification bitmaps for all + * contexts. This page will be mapped into all processes. + */ + unsigned long *events; + /* + * per unit status, see also portdata statusp + * mapped read-only into user processes so they can get unit and + * IB link status cheaply + */ + struct hfi1_status *status; + + /* revision register shadow */ + u64 revision; + /* Base GUID for device (network order) */ + u64 base_guid; + + /* these are the "32 bit" regs */ + + /* value we put in kr_rcvhdrsize */ + u32 rcvhdrsize; + /* number of receive contexts the chip supports */ + u32 chip_rcv_contexts; + /* number of receive array entries */ + u32 chip_rcv_array_count; + /* number of PIO send contexts the chip supports */ + u32 chip_send_contexts; + /* number of bytes in the PIO memory buffer */ + u32 chip_pio_mem_size; + /* number of bytes in the SDMA memory buffer */ + u32 chip_sdma_mem_size; + + /* size of each rcvegrbuffer */ + u32 rcvegrbufsize; + /* log2 of above */ + u16 rcvegrbufsize_shift; + /* both sides of the PCIe link are gen3 capable */ + u8 link_gen3_capable; + /* default link down value (poll/sleep) */ + u8 link_default; + /* localbus width (1, 2,4,8,16,32) from config space */ + u32 lbus_width; + /* localbus speed in MHz */ + u32 lbus_speed; + int unit; /* unit # of this chip */ + int node; /* home node of this chip */ + + /* save these PCI fields to restore after a reset */ + u32 pcibar0; + u32 pcibar1; + u32 pci_rom; + u16 pci_command; + u16 pcie_devctl; + u16 pcie_lnkctl; + u16 pcie_devctl2; + u32 pci_msix0; + u32 pci_lnkctl3; + u32 pci_tph2; + + /* + * ASCII serial number, from flash, large enough for original + * all digit strings, and longer serial number format + */ + u8 serial[SERIAL_MAX]; + /* human readable board version */ + u8 boardversion[BOARD_VERS_MAX]; + u8 lbus_info[32]; /* human readable localbus info */ + /* chip major rev, from CceRevision */ + u8 majrev; + /* chip minor rev, from CceRevision */ + u8 minrev; + /* hardware ID */ + u8 hfi1_id; + /* implementation code */ + u8 icode; + /* vAU of this device */ + u8 vau; + /* vCU of this device */ + u8 vcu; + /* link credits of this device */ + u16 link_credits; + /* initial vl15 credits to use */ + u16 vl15_init; + + /* + * Cached values for vau and vl15buf, read during verify cap + * interrupt. VL15 credits are to be kept at 0 and set when handing + * link-up interrupt. This addresses an issue where VL15 MAD packets + * could be received by one side of the link before the other is + * really ready to receive them. + */ + u8 vau_cached; + u16 vl15buf_cached; + + /* Misc small ints */ + u8 n_krcv_queues; + u8 qos_shift; + + u16 irev; /* implementation revision */ + u16 dc8051_ver; /* 8051 firmware version */ + + spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */ + struct platform_config platform_config; + struct platform_config_cache pcfg_cache; + + struct diag_client *diag_client; + + /* MSI-X information */ + struct hfi1_msix_entry *msix_entries; + u32 num_msix_entries; + + /* INTx information */ + u32 requested_intx_irq; /* did we request one? */ + char intx_name[MAX_NAME_SIZE]; /* INTx name */ + + /* general interrupt: mask of handled interrupts */ + u64 gi_mask[CCE_NUM_INT_CSRS]; + + struct rcv_array_data rcv_entries; + + /* cycle length of PS* counters in HW (in picoseconds) */ + u16 psxmitwait_check_rate; + + /* + * 64 bit synthetic counters + */ + struct timer_list synth_stats_timer; + + /* + * device counters + */ + char *cntrnames; + size_t cntrnameslen; + size_t ndevcntrs; + u64 *cntrs; + u64 *scntrs; + + /* + * remembered values for synthetic counters + */ + u64 last_tx; + u64 last_rx; + + /* + * per-port counters + */ + size_t nportcntrs; + char *portcntrnames; + size_t portcntrnameslen; + + struct hfi1_snoop_data hfi1_snoop; + + struct err_info_rcvport err_info_rcvport; + struct err_info_constraint err_info_rcv_constraint; + struct err_info_constraint err_info_xmit_constraint; + + atomic_t drop_packet; + u8 do_drop; + u8 err_info_uncorrectable; + u8 err_info_fmconfig; + + /* + * Software counters for the status bits defined by the + * associated error status registers + */ + u64 cce_err_status_cnt[NUM_CCE_ERR_STATUS_COUNTERS]; + u64 rcv_err_status_cnt[NUM_RCV_ERR_STATUS_COUNTERS]; + u64 misc_err_status_cnt[NUM_MISC_ERR_STATUS_COUNTERS]; + u64 send_pio_err_status_cnt[NUM_SEND_PIO_ERR_STATUS_COUNTERS]; + u64 send_dma_err_status_cnt[NUM_SEND_DMA_ERR_STATUS_COUNTERS]; + u64 send_egress_err_status_cnt[NUM_SEND_EGRESS_ERR_STATUS_COUNTERS]; + u64 send_err_status_cnt[NUM_SEND_ERR_STATUS_COUNTERS]; + + /* Software counter that spans all contexts */ + u64 sw_ctxt_err_status_cnt[NUM_SEND_CTXT_ERR_STATUS_COUNTERS]; + /* Software counter that spans all DMA engines */ + u64 sw_send_dma_eng_err_status_cnt[ + NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS]; + /* Software counter that aggregates all cce_err_status errors */ + u64 sw_cce_err_status_aggregate; + /* Software counter that aggregates all bypass packet rcv errors */ + u64 sw_rcv_bypass_packet_errors; + /* receive interrupt function */ + rhf_rcv_function_ptr normal_rhf_rcv_functions[8]; + + /* Save the enabled LCB error bits */ + u64 lcb_err_en; + + /* + * Handlers for outgoing data so that snoop/capture does not + * have to have its hooks in the send path + */ + send_routine process_pio_send ____cacheline_aligned_in_smp; + send_routine process_dma_send; + void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf, + u64 pbc, const void *from, size_t count); + /* hfi1_pportdata, points to array of (physical) port-specific + * data structs, indexed by pidx (0..n-1) + */ + struct hfi1_pportdata *pport; + /* receive context data */ + struct hfi1_ctxtdata **rcd; + u64 __percpu *int_counter; + /* device (not port) flags, basically device capabilities */ + u16 flags; + /* Number of physical ports available */ + u8 num_pports; + /* Lowest context number which can be used by user processes */ + u8 first_user_ctxt; + /* adding a new field here would make it part of this cacheline */ + + /* seqlock for sc2vl */ + seqlock_t sc2vl_lock ____cacheline_aligned_in_smp; + u64 sc2vl[4]; + /* receive interrupt functions */ + rhf_rcv_function_ptr *rhf_rcv_function_map; + u64 __percpu *rcv_limit; + u16 rhf_offset; /* offset of RHF within receive header entry */ + /* adding a new field here would make it part of this cacheline */ + + /* OUI comes from the HW. Used everywhere as 3 separate bytes. */ + u8 oui1; + u8 oui2; + u8 oui3; + u8 dc_shutdown; + + /* Timer and counter used to detect RcvBufOvflCnt changes */ + struct timer_list rcverr_timer; + + wait_queue_head_t event_queue; + + /* receive context tail dummy address */ + __le64 *rcvhdrtail_dummy_kvaddr; + dma_addr_t rcvhdrtail_dummy_dma; + + u32 rcv_ovfl_cnt; + /* Serialize ASPM enable/disable between multiple verbs contexts */ + spinlock_t aspm_lock; + /* Number of verbs contexts which have disabled ASPM */ + atomic_t aspm_disabled_cnt; + /* Keeps track of user space clients */ + atomic_t user_refcount; + /* Used to wait for outstanding user space clients before dev removal */ + struct completion user_comp; + bool eprom_available; /* true if EPROM is available for this device */ + bool aspm_supported; /* Does HW support ASPM */ + bool aspm_enabled; /* ASPM state: enabled/disabled */ + struct rhashtable sdma_rht; + + struct kobject kobj; +}; + +/* 8051 firmware version helper */ +#define dc8051_ver(a, b) ((a) << 8 | (b)) + +/* f_put_tid types */ +#define PT_EXPECTED 0 +#define PT_EAGER 1 +#define PT_INVALID 2 + +struct tid_rb_node; +struct mmu_rb_node; +struct mmu_rb_handler; + +/* Private data for file operations */ +struct hfi1_filedata { + struct hfi1_ctxtdata *uctxt; + unsigned subctxt; + struct hfi1_user_sdma_comp_q *cq; + struct hfi1_user_sdma_pkt_q *pq; + /* for cpu affinity; -1 if none */ + int rec_cpu_num; + u32 tid_n_pinned; + struct mmu_rb_handler *handler; + struct tid_rb_node **entry_to_rb; + spinlock_t tid_lock; /* protect tid_[limit,used] counters */ + u32 tid_limit; + u32 tid_used; + u32 *invalid_tids; + u32 invalid_tid_idx; + /* protect invalid_tids array and invalid_tid_idx */ + spinlock_t invalid_lock; + struct mm_struct *mm; +}; + +extern struct list_head hfi1_dev_list; +extern spinlock_t hfi1_devs_lock; +struct hfi1_devdata *hfi1_lookup(int unit); +extern u32 hfi1_cpulist_count; +extern unsigned long *hfi1_cpulist; + +extern unsigned int snoop_drop_send; +extern unsigned int snoop_force_capture; +int hfi1_init(struct hfi1_devdata *, int); +int hfi1_count_units(int *npresentp, int *nupp); +int hfi1_count_active_units(void); + +int hfi1_diag_add(struct hfi1_devdata *); +void hfi1_diag_remove(struct hfi1_devdata *); +void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup); + +void handle_user_interrupt(struct hfi1_ctxtdata *rcd); + +int hfi1_create_rcvhdrq(struct hfi1_devdata *, struct hfi1_ctxtdata *); +int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *); +int hfi1_create_ctxts(struct hfi1_devdata *dd); +struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *, u32, int); +int hfi1_init_pportdata(struct pci_dev *, struct hfi1_pportdata *, + struct hfi1_devdata *, u8, u8); +void hfi1_free_ctxtdata(struct hfi1_devdata *, struct hfi1_ctxtdata *); + +int handle_receive_interrupt(struct hfi1_ctxtdata *, int); +int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *, int); +int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *, int); +void set_all_slowpath(struct hfi1_devdata *dd); + +extern const struct pci_device_id hfi1_pci_tbl[]; + +/* receive packet handler dispositions */ +#define RCV_PKT_OK 0x0 /* keep going */ +#define RCV_PKT_LIMIT 0x1 /* stop, hit limit, start thread */ +#define RCV_PKT_DONE 0x2 /* stop, no more packets detected */ + +/* calculate the current RHF address */ +static inline __le32 *get_rhf_addr(struct hfi1_ctxtdata *rcd) +{ + return (__le32 *)rcd->rcvhdrq + rcd->head + rcd->dd->rhf_offset; +} + +int hfi1_reset_device(int); + +/* return the driver's idea of the logical OPA port state */ +static inline u32 driver_lstate(struct hfi1_pportdata *ppd) +{ + /* + * The driver does some processing from the time the logical + * link state is at INIT to the time the SM can be notified + * as such. Return IB_PORT_DOWN until the software state + * is ready. + */ + if (ppd->lstate == IB_PORT_INIT && !(ppd->host_link_state & HLS_UP)) + return IB_PORT_DOWN; + else + return ppd->lstate; +} + +void receive_interrupt_work(struct work_struct *work); + +/* extract service channel from header and rhf */ +static inline int hdr2sc(struct ib_header *hdr, u64 rhf) +{ + return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) | + ((!!(rhf_dc_info(rhf))) << 4); +} + +#define HFI1_JKEY_WIDTH 16 +#define HFI1_JKEY_MASK (BIT(16) - 1) +#define HFI1_ADMIN_JKEY_RANGE 32 + +/* + * J_KEYs are split and allocated in the following groups: + * 0 - 31 - users with administrator privileges + * 32 - 63 - kernel protocols using KDETH packets + * 64 - 65535 - all other users using KDETH packets + */ +static inline u16 generate_jkey(kuid_t uid) +{ + u16 jkey = from_kuid(current_user_ns(), uid) & HFI1_JKEY_MASK; + + if (capable(CAP_SYS_ADMIN)) + jkey &= HFI1_ADMIN_JKEY_RANGE - 1; + else if (jkey < 64) + jkey |= BIT(HFI1_JKEY_WIDTH - 1); + + return jkey; +} + +/* + * active_egress_rate + * + * returns the active egress rate in units of [10^6 bits/sec] + */ +static inline u32 active_egress_rate(struct hfi1_pportdata *ppd) +{ + u16 link_speed = ppd->link_speed_active; + u16 link_width = ppd->link_width_active; + u32 egress_rate; + + if (link_speed == OPA_LINK_SPEED_25G) + egress_rate = 25000; + else /* assume OPA_LINK_SPEED_12_5G */ + egress_rate = 12500; + + switch (link_width) { + case OPA_LINK_WIDTH_4X: + egress_rate *= 4; + break; + case OPA_LINK_WIDTH_3X: + egress_rate *= 3; + break; + case OPA_LINK_WIDTH_2X: + egress_rate *= 2; + break; + default: + /* assume IB_WIDTH_1X */ + break; + } + + return egress_rate; +} + +/* + * egress_cycles + * + * Returns the number of 'fabric clock cycles' to egress a packet + * of length 'len' bytes, at 'rate' Mbit/s. Since the fabric clock + * rate is (approximately) 805 MHz, the units of the returned value + * are (1/805 MHz). + */ +static inline u32 egress_cycles(u32 len, u32 rate) +{ + u32 cycles; + + /* + * cycles is: + * + * (length) [bits] / (rate) [bits/sec] + * --------------------------------------------------- + * fabric_clock_period == 1 /(805 * 10^6) [cycles/sec] + */ + + cycles = len * 8; /* bits */ + cycles *= 805; + cycles /= rate; + + return cycles; +} + +void set_link_ipg(struct hfi1_pportdata *ppd); +void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, + u32 rqpn, u8 svc_type); +void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, + u32 pkey, u32 slid, u32 dlid, u8 sc5, + const struct ib_grh *old_grh); +#define PKEY_CHECK_INVALID -1 +int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth, + u8 sc5, int8_t s_pkey_index); + +#define PACKET_EGRESS_TIMEOUT 350 +static inline void pause_for_credit_return(struct hfi1_devdata *dd) +{ + /* Pause at least 1us, to ensure chip returns all credits */ + u32 usec = cclock_to_ns(dd, PACKET_EGRESS_TIMEOUT) / 1000; + + udelay(usec ? usec : 1); +} + +/** + * sc_to_vlt() reverse lookup sc to vl + * @dd - devdata + * @sc5 - 5 bit sc + */ +static inline u8 sc_to_vlt(struct hfi1_devdata *dd, u8 sc5) +{ + unsigned seq; + u8 rval; + + if (sc5 >= OPA_MAX_SCS) + return (u8)(0xff); + + do { + seq = read_seqbegin(&dd->sc2vl_lock); + rval = *(((u8 *)dd->sc2vl) + sc5); + } while (read_seqretry(&dd->sc2vl_lock, seq)); + + return rval; +} + +#define PKEY_MEMBER_MASK 0x8000 +#define PKEY_LOW_15_MASK 0x7fff + +/* + * ingress_pkey_matches_entry - return 1 if the pkey matches ent (ent + * being an entry from the ingress partition key table), return 0 + * otherwise. Use the matching criteria for ingress partition keys + * specified in the OPAv1 spec., section 9.10.14. + */ +static inline int ingress_pkey_matches_entry(u16 pkey, u16 ent) +{ + u16 mkey = pkey & PKEY_LOW_15_MASK; + u16 ment = ent & PKEY_LOW_15_MASK; + + if (mkey == ment) { + /* + * If pkey[15] is clear (limited partition member), + * is bit 15 in the corresponding table element + * clear (limited member)? + */ + if (!(pkey & PKEY_MEMBER_MASK)) + return !!(ent & PKEY_MEMBER_MASK); + return 1; + } + return 0; +} + +/* + * ingress_pkey_table_search - search the entire pkey table for + * an entry which matches 'pkey'. return 0 if a match is found, + * and 1 otherwise. + */ +static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey) +{ + int i; + + for (i = 0; i < MAX_PKEY_VALUES; i++) { + if (ingress_pkey_matches_entry(pkey, ppd->pkeys[i])) + return 0; + } + return 1; +} + +/* + * ingress_pkey_table_fail - record a failure of ingress pkey validation, + * i.e., increment port_rcv_constraint_errors for the port, and record + * the 'error info' for this failure. + */ +static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey, + u16 slid) +{ + struct hfi1_devdata *dd = ppd->dd; + + incr_cntr64(&ppd->port_rcv_constraint_errors); + if (!(dd->err_info_rcv_constraint.status & OPA_EI_STATUS_SMASK)) { + dd->err_info_rcv_constraint.status |= OPA_EI_STATUS_SMASK; + dd->err_info_rcv_constraint.slid = slid; + dd->err_info_rcv_constraint.pkey = pkey; + } +} + +/* + * ingress_pkey_check - Return 0 if the ingress pkey is valid, return 1 + * otherwise. Use the criteria in the OPAv1 spec, section 9.10.14. idx + * is a hint as to the best place in the partition key table to begin + * searching. This function should not be called on the data path because + * of performance reasons. On datapath pkey check is expected to be done + * by HW and rcv_pkey_check function should be called instead. + */ +static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey, + u8 sc5, u8 idx, u16 slid) +{ + if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN)) + return 0; + + /* If SC15, pkey[0:14] must be 0x7fff */ + if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) + goto bad; + + /* Is the pkey = 0x0, or 0x8000? */ + if ((pkey & PKEY_LOW_15_MASK) == 0) + goto bad; + + /* The most likely matching pkey has index 'idx' */ + if (ingress_pkey_matches_entry(pkey, ppd->pkeys[idx])) + return 0; + + /* no match - try the whole table */ + if (!ingress_pkey_table_search(ppd, pkey)) + return 0; + +bad: + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; +} + +/* + * rcv_pkey_check - Return 0 if the ingress pkey is valid, return 1 + * otherwise. It only ensures pkey is vlid for QP0. This function + * should be called on the data path instead of ingress_pkey_check + * as on data path, pkey check is done by HW (except for QP0). + */ +static inline int rcv_pkey_check(struct hfi1_pportdata *ppd, u16 pkey, + u8 sc5, u16 slid) +{ + if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN)) + return 0; + + /* If SC15, pkey[0:14] must be 0x7fff */ + if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) + goto bad; + + return 0; +bad: + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; +} + +/* MTU handling */ + +/* MTU enumeration, 256-4k match IB */ +#define OPA_MTU_0 0 +#define OPA_MTU_256 1 +#define OPA_MTU_512 2 +#define OPA_MTU_1024 3 +#define OPA_MTU_2048 4 +#define OPA_MTU_4096 5 + +u32 lrh_max_header_bytes(struct hfi1_devdata *dd); +int mtu_to_enum(u32 mtu, int default_if_bad); +u16 enum_to_mtu(int); +static inline int valid_ib_mtu(unsigned int mtu) +{ + return mtu == 256 || mtu == 512 || + mtu == 1024 || mtu == 2048 || + mtu == 4096; +} + +static inline int valid_opa_max_mtu(unsigned int mtu) +{ + return mtu >= 2048 && + (valid_ib_mtu(mtu) || mtu == 8192 || mtu == 10240); +} + +int set_mtu(struct hfi1_pportdata *); + +int hfi1_set_lid(struct hfi1_pportdata *, u32, u8); +void hfi1_disable_after_error(struct hfi1_devdata *); +int hfi1_set_uevent_bits(struct hfi1_pportdata *, const int); +int hfi1_rcvbuf_validate(u32, u8, u16 *); + +int fm_get_table(struct hfi1_pportdata *, int, void *); +int fm_set_table(struct hfi1_pportdata *, int, void *); + +void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf); +void reset_link_credits(struct hfi1_devdata *dd); +void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu); + +int snoop_recv_handler(struct hfi1_packet *packet); +int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc); +int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc); +void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf, + u64 pbc, const void *from, size_t count); +int set_buffer_control(struct hfi1_pportdata *ppd, struct buffer_control *bc); + +static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd) +{ + return ppd->dd; +} + +static inline struct hfi1_devdata *dd_from_dev(struct hfi1_ibdev *dev) +{ + return container_of(dev, struct hfi1_devdata, verbs_dev); +} + +static inline struct hfi1_devdata *dd_from_ibdev(struct ib_device *ibdev) +{ + return dd_from_dev(to_idev(ibdev)); +} + +static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp) +{ + return container_of(ibp, struct hfi1_pportdata, ibport_data); +} + +static inline struct hfi1_ibdev *dev_from_rdi(struct rvt_dev_info *rdi) +{ + return container_of(rdi, struct hfi1_ibdev, rdi); +} + +static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + WARN_ON(pidx >= dd->num_pports); + return &dd->pport[pidx].ibport_data; +} + +static inline struct hfi1_ibport *rcd_to_iport(struct hfi1_ctxtdata *rcd) +{ + return &rcd->ppd->ibport_data; +} + +void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, + bool do_cnp); +static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt, + bool do_cnp) +{ + struct ib_other_headers *ohdr = pkt->ohdr; + u32 bth1; + + bth1 = be32_to_cpu(ohdr->bth[1]); + if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) { + hfi1_process_ecn_slowpath(qp, pkt, do_cnp); + return bth1 & HFI1_FECN_SMASK; + } + return false; +} + +/* + * Return the indexed PKEY from the port PKEY table. + */ +static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u16 ret; + + if (index >= ARRAY_SIZE(ppd->pkeys)) + ret = 0; + else + ret = ppd->pkeys[index]; + + return ret; +} + +/* + * Return the indexed GUID from the port GUIDs table. + */ +static inline __be64 get_sguid(struct hfi1_ibport *ibp, unsigned int index) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + WARN_ON(index >= HFI1_GUIDS_PER_PORT); + return cpu_to_be64(ppd->guids[index]); +} + +/* + * Called by readers of cc_state only, must call under rcu_read_lock(). + */ +static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd) +{ + return rcu_dereference(ppd->cc_state); +} + +/* + * Called by writers of cc_state only, must call under cc_state_lock. + */ +static inline +struct cc_state *get_cc_state_protected(struct hfi1_pportdata *ppd) +{ + return rcu_dereference_protected(ppd->cc_state, + lockdep_is_held(&ppd->cc_state_lock)); +} + +/* + * values for dd->flags (_device_ related flags) + */ +#define HFI1_INITTED 0x1 /* chip and driver up and initted */ +#define HFI1_PRESENT 0x2 /* chip accesses can be done */ +#define HFI1_FROZEN 0x4 /* chip in SPC freeze */ +#define HFI1_HAS_SDMA_TIMEOUT 0x8 +#define HFI1_HAS_SEND_DMA 0x10 /* Supports Send DMA */ +#define HFI1_FORCED_FREEZE 0x80 /* driver forced freeze mode */ + +/* IB dword length mask in PBC (lower 11 bits); same for all chips */ +#define HFI1_PBC_LENGTH_MASK ((1 << 11) - 1) + +/* ctxt_flag bit offsets */ + /* context has been setup */ +#define HFI1_CTXT_SETUP_DONE 1 + /* waiting for a packet to arrive */ +#define HFI1_CTXT_WAITING_RCV 2 + /* master has not finished initializing */ +#define HFI1_CTXT_MASTER_UNINIT 4 + /* waiting for an urgent packet to arrive */ +#define HFI1_CTXT_WAITING_URG 5 + +/* free up any allocated data at closes */ +struct hfi1_devdata *hfi1_init_dd(struct pci_dev *, + const struct pci_device_id *); +void hfi1_free_devdata(struct hfi1_devdata *); +struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra); + +/* LED beaconing functions */ +void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon, + unsigned int timeoff); +void shutdown_led_override(struct hfi1_pportdata *ppd); + +#define HFI1_CREDIT_RETURN_RATE (100) + +/* + * The number of words for the KDETH protocol field. If this is + * larger then the actual field used, then part of the payload + * will be in the header. + * + * Optimally, we want this sized so that a typical case will + * use full cache lines. The typical local KDETH header would + * be: + * + * Bytes Field + * 8 LRH + * 12 BHT + * ?? KDETH + * 8 RHF + * --- + * 28 + KDETH + * + * For a 64-byte cache line, KDETH would need to be 36 bytes or 9 DWORDS + */ +#define DEFAULT_RCVHDRSIZE 9 + +/* + * Maximal header byte count: + * + * Bytes Field + * 8 LRH + * 40 GRH (optional) + * 12 BTH + * ?? KDETH + * 8 RHF + * --- + * 68 + KDETH + * + * We also want to maintain a cache line alignment to assist DMA'ing + * of the header bytes. Round up to a good size. + */ +#define DEFAULT_RCVHDR_ENTSIZE 32 + +bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, + u32 nlocked, u32 npages); +int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, + size_t npages, bool writable, struct page **pages); +void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, + size_t npages, bool dirty); + +static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd) +{ + *((u64 *)rcd->rcvhdrtail_kvaddr) = 0ULL; +} + +static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd) +{ + /* + * volatile because it's a DMA target from the chip, routine is + * inlined, and don't want register caching or reordering. + */ + return (u32)le64_to_cpu(*rcd->rcvhdrtail_kvaddr); +} + +/* + * sysfs interface. + */ + +extern const char ib_hfi1_version[]; + +int hfi1_device_create(struct hfi1_devdata *); +void hfi1_device_remove(struct hfi1_devdata *); + +int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, + struct kobject *kobj); +int hfi1_verbs_register_sysfs(struct hfi1_devdata *); +void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *); +/* Hook for sysfs read of QSFP */ +int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len); + +int hfi1_pcie_init(struct pci_dev *, const struct pci_device_id *); +void hfi1_pcie_cleanup(struct pci_dev *); +int hfi1_pcie_ddinit(struct hfi1_devdata *, struct pci_dev *); +void hfi1_pcie_ddcleanup(struct hfi1_devdata *); +void hfi1_pcie_flr(struct hfi1_devdata *); +int pcie_speeds(struct hfi1_devdata *); +void request_msix(struct hfi1_devdata *, u32 *, struct hfi1_msix_entry *); +void hfi1_enable_intx(struct pci_dev *); +void restore_pci_variables(struct hfi1_devdata *dd); +int do_pcie_gen3_transition(struct hfi1_devdata *dd); +int parse_platform_config(struct hfi1_devdata *dd); +int get_platform_config_field(struct hfi1_devdata *dd, + enum platform_config_table_type_encoding + table_type, int table_index, int field_index, + u32 *data, u32 len); + +const char *get_unit_name(int unit); +const char *get_card_name(struct rvt_dev_info *rdi); +struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi); + +/* + * Flush write combining store buffers (if present) and perform a write + * barrier. + */ +static inline void flush_wc(void) +{ + asm volatile("sfence" : : : "memory"); +} + +void handle_eflags(struct hfi1_packet *packet); +int process_receive_ib(struct hfi1_packet *packet); +int process_receive_bypass(struct hfi1_packet *packet); +int process_receive_error(struct hfi1_packet *packet); +int kdeth_process_expected(struct hfi1_packet *packet); +int kdeth_process_eager(struct hfi1_packet *packet); +int process_receive_invalid(struct hfi1_packet *packet); + +extern rhf_rcv_function_ptr snoop_rhf_rcv_functions[8]; + +/* global module parameter variables */ +extern unsigned int hfi1_max_mtu; +extern unsigned int hfi1_cu; +extern unsigned int user_credit_return_threshold; +extern int num_user_contexts; +extern unsigned long n_krcvqs; +extern uint krcvqs[]; +extern int krcvqsset; +extern uint kdeth_qp; +extern uint loopback; +extern uint quick_linkup; +extern uint rcv_intr_timeout; +extern uint rcv_intr_count; +extern uint rcv_intr_dynamic; +extern ushort link_crc_mask; + +extern struct mutex hfi1_mutex; + +/* Number of seconds before our card status check... */ +#define STATUS_TIMEOUT 60 + +#define DRIVER_NAME "hfi1" +#define HFI1_USER_MINOR_BASE 0 +#define HFI1_TRACE_MINOR 127 +#define HFI1_DIAGPKT_MINOR 128 +#define HFI1_DIAG_MINOR_BASE 129 +#define HFI1_SNOOP_CAPTURE_BASE 200 +#define HFI1_NMINORS 255 + +#define PCI_VENDOR_ID_INTEL 0x8086 +#define PCI_DEVICE_ID_INTEL0 0x24f0 +#define PCI_DEVICE_ID_INTEL1 0x24f1 + +#define HFI1_PKT_USER_SC_INTEGRITY \ + (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK) + +#define HFI1_PKT_KERNEL_SC_INTEGRITY \ + (SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK) + +static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, + u16 ctxt_type) +{ + u64 base_sc_integrity; + + /* + * No integrity checks if HFI1_CAP_NO_INTEGRITY is set + * or driver is snooping + */ + if (HFI1_CAP_IS_KSET(NO_INTEGRITY) || + (dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)) + return 0; + + base_sc_integrity = + SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK; + + if (ctxt_type == SC_USER) + base_sc_integrity |= HFI1_PKT_USER_SC_INTEGRITY; + else if (ctxt_type != SC_KERNEL) + base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY; + + /* turn on send-side job key checks if !A0 */ + if (!is_ax(dd)) + base_sc_integrity |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + + return base_sc_integrity; +} + +static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) +{ + u64 base_sdma_integrity; + + /* No integrity checks if HFI1_CAP_NO_INTEGRITY is set */ + if (HFI1_CAP_IS_KSET(NO_INTEGRITY)) + return 0; + + base_sdma_integrity = + SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK; + + if (!HFI1_CAP_IS_KSET(STATIC_RATE_CTRL)) + base_sdma_integrity |= + SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK; + + /* turn on send-side job key checks if !A0 */ + if (!is_ax(dd)) + base_sdma_integrity |= + SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + + return base_sdma_integrity; +} + +/* + * hfi1_early_err is used (only!) to print early errors before devdata is + * allocated, or when dd->pcidev may not be valid, and at the tail end of + * cleanup when devdata may have been freed, etc. hfi1_dev_porterr is + * the same as dd_dev_err, but is used when the message really needs + * the IB port# to be definitive as to what's happening.. + */ +#define hfi1_early_err(dev, fmt, ...) \ + dev_err(dev, fmt, ##__VA_ARGS__) + +#define hfi1_early_info(dev, fmt, ...) \ + dev_info(dev, fmt, ##__VA_ARGS__) + +#define dd_dev_emerg(dd, fmt, ...) \ + dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) +#define dd_dev_err(dd, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) +#define dd_dev_warn(dd, fmt, ...) \ + dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define dd_dev_warn_ratelimited(dd, fmt, ...) \ + dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define dd_dev_info(dd, fmt, ...) \ + dev_info(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define dd_dev_dbg(dd, fmt, ...) \ + dev_dbg(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define hfi1_dev_porterr(dd, port, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \ + get_unit_name((dd)->unit), (port), ##__VA_ARGS__) + +/* + * this is used for formatting hw error messages... + */ +struct hfi1_hwerror_msgs { + u64 mask; + const char *msg; + size_t sz; +}; + +/* in intr.c... */ +void hfi1_format_hwerrors(u64 hwerrs, + const struct hfi1_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t lmsg); + +#define USER_OPCODE_CHECK_VAL 0xC0 +#define USER_OPCODE_CHECK_MASK 0xC0 +#define OPCODE_CHECK_VAL_DISABLED 0x0 +#define OPCODE_CHECK_MASK_DISABLED 0x0 + +static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + dd->z_int_counter = get_all_cpu_total(dd->int_counter); + dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit); + dd->z_send_schedule = get_all_cpu_total(dd->send_schedule); + + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + ppd->ibport_data.rvp.z_rc_acks = + get_all_cpu_total(ppd->ibport_data.rvp.rc_acks); + ppd->ibport_data.rvp.z_rc_qacks = + get_all_cpu_total(ppd->ibport_data.rvp.rc_qacks); + } +} + +/* Control LED state */ +static inline void setextled(struct hfi1_devdata *dd, u32 on) +{ + if (on) + write_csr(dd, DCC_CFG_LED_CNTRL, 0x1F); + else + write_csr(dd, DCC_CFG_LED_CNTRL, 0x10); +} + +/* return the i2c resource given the target */ +static inline u32 i2c_target(u32 target) +{ + return target ? CR_I2C2 : CR_I2C1; +} + +/* return the i2c chain chip resource that this HFI uses for QSFP */ +static inline u32 qsfp_resource(struct hfi1_devdata *dd) +{ + return i2c_target(dd->hfi1_id); +} + +/* Is this device integrated or discrete? */ +static inline bool is_integrated(struct hfi1_devdata *dd) +{ + return dd->pcidev->device == PCI_DEVICE_ID_INTEL1; +} + +int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); + +#define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) +#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev)) + +#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype } +#define show_packettype(etype) \ +__print_symbolic(etype, \ + packettype_name(EXPECTED), \ + packettype_name(EAGER), \ + packettype_name(IB), \ + packettype_name(ERROR), \ + packettype_name(BYPASS)) + +#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode } +#define show_ib_opcode(opcode) \ +__print_symbolic(opcode, \ + ib_opcode_name(RC_SEND_FIRST), \ + ib_opcode_name(RC_SEND_MIDDLE), \ + ib_opcode_name(RC_SEND_LAST), \ + ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_SEND_ONLY), \ + ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_FIRST), \ + ib_opcode_name(RC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(RC_RDMA_WRITE_LAST), \ + ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_READ_REQUEST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY), \ + ib_opcode_name(RC_ACKNOWLEDGE), \ + ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ + ib_opcode_name(RC_COMPARE_SWAP), \ + ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(TID_RDMA_WRITE_REQ), \ + ib_opcode_name(TID_RDMA_WRITE_RESP), \ + ib_opcode_name(TID_RDMA_WRITE_DATA), \ + ib_opcode_name(TID_RDMA_WRITE_DATA_LAST), \ + ib_opcode_name(TID_RDMA_READ_REQ), \ + ib_opcode_name(TID_RDMA_READ_RESP), \ + ib_opcode_name(TID_RDMA_ACK), \ + ib_opcode_name(UC_SEND_FIRST), \ + ib_opcode_name(UC_SEND_MIDDLE), \ + ib_opcode_name(UC_SEND_LAST), \ + ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_SEND_ONLY), \ + ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_FIRST), \ + ib_opcode_name(UC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(UC_RDMA_WRITE_LAST), \ + ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UD_SEND_ONLY), \ + ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(CNP)) +#endif /* _HFI1_KERNEL_H */ diff --git a/kernel/include/hfi1/hfi1_user.h b/kernel/include/hfi1/hfi1_user.h new file mode 100644 index 00000000..4705969a --- /dev/null +++ b/kernel/include/hfi1/hfi1_user.h @@ -0,0 +1,439 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + +#ifndef _LINUX__HFI1_USER_H +#define _LINUX__HFI1_USER_H + +#include + +/* + * This version number is given to the driver by the user code during + * initialization in the spu_userversion field of hfi1_user_info, so + * the driver can check for compatibility with user code. + * + * The major version changes when data structures change in an incompatible + * way. The driver must be the same for initialization to succeed. + */ +#define HFI1_USER_SWMAJOR 6 + +/* + * Minor version differences are always compatible + * a within a major version, however if user software is larger + * than driver software, some new features and/or structure fields + * may not be implemented; the user code must deal with this if it + * cares, or it must abort after initialization reports the difference. + */ +#define HFI1_USER_SWMINOR 3 + +/* + * We will encode the major/minor inside a single 32bit version number. + */ +#define HFI1_SWMAJOR_SHIFT 16 + +/* + * Set of HW and driver capability/feature bits. + * These bit values are used to configure enabled/disabled HW and + * driver features. The same set of bits are communicated to user + * space. + */ +#define HFI1_CAP_DMA_RTAIL (1UL << 0) /* Use DMA'ed RTail value */ +#define HFI1_CAP_SDMA (1UL << 1) /* Enable SDMA support */ +#define HFI1_CAP_SDMA_AHG (1UL << 2) /* Enable SDMA AHG support */ +#define HFI1_CAP_EXTENDED_PSN (1UL << 3) /* Enable Extended PSN support */ +#define HFI1_CAP_HDRSUPP (1UL << 4) /* Enable Header Suppression */ +#define HFI1_CAP_TID_RDMA (1UL << 5) /* Enable TID RDMA operations */ +#define HFI1_CAP_USE_SDMA_HEAD (1UL << 6) /* DMA Hdr Q tail vs. use CSR */ +#define HFI1_CAP_MULTI_PKT_EGR (1UL << 7) /* Enable multi-packet Egr buffs*/ +#define HFI1_CAP_NODROP_RHQ_FULL (1UL << 8) /* Don't drop on Hdr Q full */ +#define HFI1_CAP_NODROP_EGR_FULL (1UL << 9) /* Don't drop on EGR buffs full */ +#define HFI1_CAP_TID_UNMAP (1UL << 10) /* Disable Expected TID caching */ +#define HFI1_CAP_PRINT_UNIMPL (1UL << 11) /* Show for unimplemented feats */ +#define HFI1_CAP_ALLOW_PERM_JKEY (1UL << 12) /* Allow use of permissive JKEY */ +#define HFI1_CAP_NO_INTEGRITY (1UL << 13) /* Enable ctxt integrity checks */ +#define HFI1_CAP_PKEY_CHECK (1UL << 14) /* Enable ctxt PKey checking */ +#define HFI1_CAP_STATIC_RATE_CTRL (1UL << 15) /* Allow PBC.StaticRateControl */ +#define HFI1_CAP_OPFN (1UL << 16) /* Enable the OPFN protocol */ +#define HFI1_CAP_SDMA_HEAD_CHECK (1UL << 17) /* SDMA head checking */ +#define HFI1_CAP_EARLY_CREDIT_RETURN (1UL << 18) /* early credit return */ + +#define HFI1_RCVHDR_ENTSIZE_2 (1UL << 0) +#define HFI1_RCVHDR_ENTSIZE_16 (1UL << 1) +#define HFI1_RCVDHR_ENTSIZE_32 (1UL << 2) + +/* User commands. */ +#define HFI1_CMD_ASSIGN_CTXT 1 /* allocate HFI and context */ +#define HFI1_CMD_CTXT_INFO 2 /* find out what resources we got */ +#define HFI1_CMD_USER_INFO 3 /* set up userspace */ +#define HFI1_CMD_TID_UPDATE 4 /* update expected TID entries */ +#define HFI1_CMD_TID_FREE 5 /* free expected TID entries */ +#define HFI1_CMD_CREDIT_UPD 6 /* force an update of PIO credit */ + +#define HFI1_CMD_RECV_CTRL 8 /* control receipt of packets */ +#define HFI1_CMD_POLL_TYPE 9 /* set the kind of polling we want */ +#define HFI1_CMD_ACK_EVENT 10 /* ack & clear user status bits */ +#define HFI1_CMD_SET_PKEY 11 /* set context's pkey */ +#define HFI1_CMD_CTXT_RESET 12 /* reset context's HW send context */ +#define HFI1_CMD_TID_INVAL_READ 13 /* read TID cache invalidations */ +#define HFI1_CMD_GET_VERS 14 /* get the version of the user cdev */ + +/* + * User IOCTLs can not go above 128 if they do then see common.h and change the + * base for the snoop ioctl + */ +#define IB_IOCTL_MAGIC 0x1b /* See Documentation/ioctl/ioctl-number.txt */ + +/* + * Make the ioctls occupy the last 0xf0-0xff portion of the IB range + */ +#define __NUM(cmd) (HFI1_CMD_##cmd + 0xe0) + +struct hfi1_cmd; +#define HFI1_IOCTL_ASSIGN_CTXT \ + _IOWR(IB_IOCTL_MAGIC, __NUM(ASSIGN_CTXT), struct hfi1_user_info) +#define HFI1_IOCTL_CTXT_INFO \ + _IOW(IB_IOCTL_MAGIC, __NUM(CTXT_INFO), struct hfi1_ctxt_info) +#define HFI1_IOCTL_USER_INFO \ + _IOW(IB_IOCTL_MAGIC, __NUM(USER_INFO), struct hfi1_base_info) +#define HFI1_IOCTL_TID_UPDATE \ + _IOWR(IB_IOCTL_MAGIC, __NUM(TID_UPDATE), struct hfi1_tid_info) +#define HFI1_IOCTL_TID_FREE \ + _IOWR(IB_IOCTL_MAGIC, __NUM(TID_FREE), struct hfi1_tid_info) +#define HFI1_IOCTL_CREDIT_UPD \ + _IO(IB_IOCTL_MAGIC, __NUM(CREDIT_UPD)) +#define HFI1_IOCTL_RECV_CTRL \ + _IOW(IB_IOCTL_MAGIC, __NUM(RECV_CTRL), int) +#define HFI1_IOCTL_POLL_TYPE \ + _IOW(IB_IOCTL_MAGIC, __NUM(POLL_TYPE), int) +#define HFI1_IOCTL_ACK_EVENT \ + _IOW(IB_IOCTL_MAGIC, __NUM(ACK_EVENT), unsigned long) +#define HFI1_IOCTL_SET_PKEY \ + _IOW(IB_IOCTL_MAGIC, __NUM(SET_PKEY), __u16) +#define HFI1_IOCTL_CTXT_RESET \ + _IO(IB_IOCTL_MAGIC, __NUM(CTXT_RESET)) +#define HFI1_IOCTL_TID_INVAL_READ \ + _IOWR(IB_IOCTL_MAGIC, __NUM(TID_INVAL_READ), struct hfi1_tid_info) +#define HFI1_IOCTL_GET_VERS \ + _IOR(IB_IOCTL_MAGIC, __NUM(GET_VERS), int) + +#define _HFI1_EVENT_FROZEN_BIT 0 +#define _HFI1_EVENT_LINKDOWN_BIT 1 +#define _HFI1_EVENT_LID_CHANGE_BIT 2 +#define _HFI1_EVENT_LMC_CHANGE_BIT 3 +#define _HFI1_EVENT_SL2VL_CHANGE_BIT 4 +#define _HFI1_EVENT_TID_MMU_NOTIFY_BIT 5 +#define _HFI1_MAX_EVENT_BIT _HFI1_EVENT_TID_MMU_NOTIFY_BIT + +#define HFI1_EVENT_FROZEN (1UL << _HFI1_EVENT_FROZEN_BIT) +#define HFI1_EVENT_LINKDOWN (1UL << _HFI1_EVENT_LINKDOWN_BIT) +#define HFI1_EVENT_LID_CHANGE (1UL << _HFI1_EVENT_LID_CHANGE_BIT) +#define HFI1_EVENT_LMC_CHANGE (1UL << _HFI1_EVENT_LMC_CHANGE_BIT) +#define HFI1_EVENT_SL2VL_CHANGE (1UL << _HFI1_EVENT_SL2VL_CHANGE_BIT) +#define HFI1_EVENT_TID_MMU_NOTIFY (1UL << _HFI1_EVENT_TID_MMU_NOTIFY_BIT) + +/* + * These are the status bits readable (in ASCII form, 64bit value) + * from the "status" sysfs file. For binary compatibility, values + * must remain as is; removed states can be reused for different + * purposes. + */ +#define HFI1_STATUS_INITTED 0x1 /* basic initialization done */ +/* Chip has been found and initialized */ +#define HFI1_STATUS_CHIP_PRESENT 0x20 +/* IB link is at ACTIVE, usable for data traffic */ +#define HFI1_STATUS_IB_READY 0x40 +/* link is configured, LID, MTU, etc. have been set */ +#define HFI1_STATUS_IB_CONF 0x80 +/* A Fatal hardware error has occurred. */ +#define HFI1_STATUS_HWERROR 0x200 + +/* + * Number of supported shared contexts. + * This is the maximum number of software contexts that can share + * a hardware send/receive context. + */ +#define HFI1_MAX_SHARED_CTXTS 8 + +/* + * Poll types + */ +#define HFI1_POLL_TYPE_ANYRCV 0x0 +#define HFI1_POLL_TYPE_URGENT 0x1 + +/* + * This structure is passed to the driver to tell it where + * user code buffers are, sizes, etc. The offsets and sizes of the + * fields must remain unchanged, for binary compatibility. It can + * be extended, if userversion is changed so user code can tell, if needed + */ +struct hfi1_user_info { + /* + * version of user software, to detect compatibility issues. + * Should be set to HFI1_USER_SWVERSION. + */ + __u32 userversion; + __u32 pad; + /* + * If two or more processes wish to share a context, each process + * must set the subcontext_cnt and subcontext_id to the same + * values. The only restriction on the subcontext_id is that + * it be unique for a given node. + */ + __u16 subctxt_cnt; + __u16 subctxt_id; + /* 128bit UUID passed in by PSM. */ + __u8 uuid[16]; +}; + +struct hfi1_ctxt_info { + __u64 runtime_flags; /* chip/drv runtime flags (HFI1_CAP_*) */ + __u32 rcvegr_size; /* size of each eager buffer */ + __u16 num_active; /* number of active units */ + __u16 unit; /* unit (chip) assigned to caller */ + __u16 ctxt; /* ctxt on unit assigned to caller */ + __u16 subctxt; /* subctxt on unit assigned to caller */ + __u16 rcvtids; /* number of Rcv TIDs for this context */ + __u16 credits; /* number of PIO credits for this context */ + __u16 numa_node; /* NUMA node of the assigned device */ + __u16 rec_cpu; /* cpu # for affinity (0xffff if none) */ + __u16 send_ctxt; /* send context in use by this user context */ + __u16 egrtids; /* number of RcvArray entries for Eager Rcvs */ + __u16 rcvhdrq_cnt; /* number of RcvHdrQ entries */ + __u16 rcvhdrq_entsize; /* size (in bytes) for each RcvHdrQ entry */ + __u16 sdma_ring_size; /* number of entries in SDMA request ring */ +}; + +struct hfi1_tid_info { + /* virtual address of first page in transfer */ + __u64 vaddr; + /* pointer to tid array. this array is big enough */ + __u64 tidlist; + /* number of tids programmed by this request */ + __u32 tidcnt; + /* length of transfer buffer programmed by this request */ + __u32 length; +}; + +enum hfi1_sdma_comp_state { + FREE = 0, + QUEUED, + COMPLETE, + ERROR +}; + +/* + * SDMA completion ring entry + */ +struct hfi1_sdma_comp_entry { + __u32 status; + __u32 errcode; +}; + +/* + * Device status and notifications from driver to user-space. + */ +struct hfi1_status { + __u64 dev; /* device/hw status bits */ + __u64 port; /* port state and status bits */ + char freezemsg[0]; +}; + +/* + * This structure is returned by the driver immediately after + * open to get implementation-specific info, and info specific to this + * instance. + * + * This struct must have explicit pad fields where type sizes + * may result in different alignments between 32 and 64 bit + * programs, since the 64 bit * bit kernel requires the user code + * to have matching offsets + */ +struct hfi1_base_info { + /* version of hardware, for feature checking. */ + __u32 hw_version; + /* version of software, for feature checking. */ + __u32 sw_version; + /* Job key */ + __u16 jkey; + __u16 padding1; + /* + * The special QP (queue pair) value that identifies PSM + * protocol packet from standard IB packets. + */ + __u32 bthqp; + /* PIO credit return address, */ + __u64 sc_credits_addr; + /* + * Base address of write-only pio buffers for this process. + * Each buffer has sendpio_credits*64 bytes. + */ + __u64 pio_bufbase_sop; + /* + * Base address of write-only pio buffers for this process. + * Each buffer has sendpio_credits*64 bytes. + */ + __u64 pio_bufbase; + /* address where receive buffer queue is mapped into */ + __u64 rcvhdr_bufbase; + /* base address of Eager receive buffers. */ + __u64 rcvegr_bufbase; + /* base address of SDMA completion ring */ + __u64 sdma_comp_bufbase; + /* + * User register base for init code, not to be used directly by + * protocol or applications. Always maps real chip register space. + * the register addresses are: + * ur_rcvhdrhead, ur_rcvhdrtail, ur_rcvegrhead, ur_rcvegrtail, + * ur_rcvtidflow + */ + __u64 user_regbase; + /* notification events */ + __u64 events_bufbase; + /* status page */ + __u64 status_bufbase; + /* rcvhdrtail update */ + __u64 rcvhdrtail_base; + /* + * shared memory pages for subctxts if ctxt is shared; these cover + * all the processes in the group sharing a single context. + * all have enough space for the num_subcontexts value on this job. + */ + __u64 subctxt_uregbase; + __u64 subctxt_rcvegrbuf; + __u64 subctxt_rcvhdrbuf; +}; + +enum sdma_req_opcode { + EXPECTED = 0, + EAGER +}; + +#define HFI1_SDMA_REQ_VERSION_MASK 0xF +#define HFI1_SDMA_REQ_VERSION_SHIFT 0x0 +#define HFI1_SDMA_REQ_OPCODE_MASK 0xF +#define HFI1_SDMA_REQ_OPCODE_SHIFT 0x4 +#define HFI1_SDMA_REQ_IOVCNT_MASK 0xFF +#define HFI1_SDMA_REQ_IOVCNT_SHIFT 0x8 + +struct sdma_req_info { + /* + * bits 0-3 - version (currently unused) + * bits 4-7 - opcode (enum sdma_req_opcode) + * bits 8-15 - io vector count + */ + __u16 ctrl; + /* + * Number of fragments contained in this request. + * User-space has already computed how many + * fragment-sized packet the user buffer will be + * split into. + */ + __u16 npkts; + /* + * Size of each fragment the user buffer will be + * split into. + */ + __u16 fragsize; + /* + * Index of the slot in the SDMA completion ring + * this request should be using. User-space is + * in charge of managing its own ring. + */ + __u16 comp_idx; +} __packed; + +/* + * SW KDETH header. + * swdata is SW defined portion. + */ +struct hfi1_kdeth_header { + __le32 ver_tid_offset; + __le16 jkey; + __le16 hcrc; + __le32 swdata[7]; +} __packed; + +/* + * Structure describing the headers that User space uses. The + * structure above is a subset of this one. + */ +struct hfi1_pkt_header { + __le16 pbc[4]; + __be16 lrh[4]; + __be32 bth[3]; + struct hfi1_kdeth_header kdeth; +} __packed; + + +/* + * The list of usermode accessible registers. + */ +enum hfi1_ureg { + /* (RO) DMA RcvHdr to be used next. */ + ur_rcvhdrtail = 0, + /* (RW) RcvHdr entry to be processed next by host. */ + ur_rcvhdrhead = 1, + /* (RO) Index of next Eager index to use. */ + ur_rcvegrindextail = 2, + /* (RW) Eager TID to be processed next */ + ur_rcvegrindexhead = 3, + /* (RO) Receive Eager Offset Tail */ + ur_rcvegroffsettail = 4, + /* For internal use only; max register number. */ + ur_maxreg, + /* (RW) Receive TID flow table */ + ur_rcvtidflowtable = 256 +}; + +#endif /* _LINIUX__HFI1_USER_H */ diff --git a/kernel/include/hfi1/iowait.h b/kernel/include/hfi1/iowait.h new file mode 100644 index 00000000..829d9368 --- /dev/null +++ b/kernel/include/hfi1/iowait.h @@ -0,0 +1,410 @@ +#ifndef _HFI1_IOWAIT_H +#define _HFI1_IOWAIT_H +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include + +#include "sdma_txreq.h" + +/* + * typedef (*restart_t)() - restart callback + * @work: pointer to work structure + */ +typedef void (*restart_t)(struct work_struct *work); + +#define IOWAIT_PENDING_IB 0x0 +#define IOWAIT_PENDING_TID 0x1 + +/* + * A QP can have multiple Send Engines (SEs). + * + * The current use case is for supporting a TID RDMA + * packet build/xmit mechanism independent from verbs. + */ +#define IOWAIT_SES 2 +#define IOWAIT_IB_SE 0 +#define IOWAIT_TID_SE 1 + +struct sdma_txreq; +struct sdma_engine; +/** + * @iowork: the work struct + * @tx_head: list of prebuilt packets + * @iow: the parent iowait structure + * + * This structure is the work item (process) specific + * details associated with the each of the two SEs of the + * QP. + * + * The workstruct and the queued TXs are unique to each + * SE. + */ +struct iowait; +struct iowait_work { + struct work_struct iowork; + struct list_head tx_head; + struct iowait *iow; +}; + +/** + * @list: used to add/insert into QP/PQ wait lists + * @tx_head: overflow list of sdma_txreq's + * @sleep: no space callback + * @wakeup: space callback wakeup + * @sdma_drained: sdma count drained + * @lock: lock protected head of wait queue + * @iowork: workqueue overhead + * @wait_dma: wait for sdma_busy == 0 + * @wait_pio: wait for pio_busy == 0 + * @sdma_busy: # of packets in flight + * @count: total number of descriptors in tx_head'ed list + * @tx_limit: limit for overflow queuing + * @tx_count: number of tx entry's in tx_head'ed list + * @flags: wait flags (one per QP) + * @wait: SE array + * + * This is to be embedded in user's state structure + * (QP or PQ). + * + * The sleep and wakeup members are a + * bit misnamed. They do not strictly + * speaking sleep or wake up, but they + * are callbacks for the ULP to implement + * what ever queuing/dequeuing of + * the embedded iowait and its containing struct + * when a resource shortage like SDMA ring space is seen. + * + * Both potentially have locks help + * so sleeping is not allowed. + * + * The wait_dma member along with the iow + * + * The lock field is used by waiters to record + * the seqlock_t that guards the list head. + * Waiters explicity know that, but the destroy + * code that unwaits QPs does not. + */ +struct iowait { + struct list_head list; + int (*sleep)( + struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + unsigned seq); + void (*wakeup)(struct iowait *wait, int reason); + void (*sdma_drained)(struct iowait *wait); + seqlock_t *lock; + wait_queue_head_t wait_dma; + wait_queue_head_t wait_pio; + atomic_t sdma_busy; + atomic_t pio_busy; + u32 count; + u32 tx_limit; + u32 tx_count; + unsigned long flags; + struct iowait_work wait[IOWAIT_SES]; +}; + +#define SDMA_AVAIL_REASON 0 + +void iowait_set_flag(struct iowait *wait, u32 flag); +bool iowait_flag_set(struct iowait *wait, u32 flag); +void iowait_clear_flag(struct iowait *wait, u32 flag); + +void iowait_init( + struct iowait *wait, + u32 tx_limit, + void (*func)(struct work_struct *work), + void (*tidfunc)(struct work_struct *work), + int (*sleep)( + struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + unsigned seq), + void (*wakeup)(struct iowait *wait, int reason), + void (*sdma_drained)(struct iowait *wait)); + +/** + * iowait_schedule() - schedule the default send engine work + * @wait: wait struct to schedule + * @wq: workqueue for schedule + * @cpu: cpu + */ +static inline bool iowait_schedule( + struct iowait *wait, + struct workqueue_struct *wq, + int cpu) +{ + return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_IB_SE].iowork); +} + +/** + * iowait_tid_schedule - schedule the tid SE + * @wait: the iowait structure + * @wq: the work queue + * @cpu: the cpu + */ +static inline bool iowait_tid_schedule( + struct iowait *wait, + struct workqueue_struct *wq, + int cpu) +{ + return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork); +} + +/** + * iowait_sdma_drain() - wait for DMAs to drain + * @wait: iowait structure + * + * This will delay until the iowait sdmas have + * completed. + */ +static inline void iowait_sdma_drain(struct iowait *wait) +{ + wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy)); +} + +/** + * iowait_sdma_pending() - return sdma pending count + * + * @wait: iowait structure + * + */ +static inline int iowait_sdma_pending(struct iowait *wait) +{ + return atomic_read(&wait->sdma_busy); +} + +/** + * iowait_sdma_inc - note sdma io pending + * @wait: iowait structure + */ +static inline void iowait_sdma_inc(struct iowait *wait) +{ + atomic_inc(&wait->sdma_busy); +} + +/** + * iowait_sdma_add - add count to pending + * @wait: iowait_work structure + */ +static inline void iowait_sdma_add(struct iowait *wait, int count) +{ + atomic_add(count, &wait->sdma_busy); +} + +/** + * iowait_pio_drain() - wait for pios to drain + * + * @wait: iowait structure + * + * This will delay until the iowait pios have + * completed. + */ +static inline void iowait_pio_drain(struct iowait *wait) +{ + wait_event_timeout(wait->wait_pio, + !atomic_read(&wait->pio_busy), + HZ); +} + +/** + * iowait_pio_pending() - return pio pending count + * + * @wait: iowait structure + * + */ +static inline int iowait_pio_pending(struct iowait *w) +{ + return atomic_read(&w->pio_busy); +} + +/** + * iowait_drain_wakeup() - trigger iowait_drain() waiter + * @wait: iowait structure + * + * This will trigger any waiters. + */ +static inline void iowait_drain_wakeup(struct iowait *w) +{ + wake_up(&w->wait_dma); + wake_up(&w->wait_pio); + if (w->sdma_drained) + w->sdma_drained(w); +} + +/** + * iowait_pio_inc - note pio pending + * @wait: iowait structure + */ +static inline void iowait_pio_inc(struct iowait *wait) +{ + atomic_inc(&wait->pio_busy); +} + +/** + * iowait_pio_dec - note pio complete + * @wait: iowait structure + */ +static inline int iowait_pio_dec(struct iowait *wait) +{ + if (!wait) + return 0; + return atomic_dec_and_test(&wait->pio_busy); +} + +/** + * iowait_sdma_dec - note pio complete + * @wait: iowait structure + */ +static inline int iowait_sdma_dec(struct iowait *wait) +{ + if (!wait) + return 0; + return atomic_dec_and_test(&wait->sdma_busy); +} + +/** + * iowait_get_txhead() - get packet off of iowait list + * @wait wait struture + */ +static inline struct sdma_txreq *iowait_get_txhead(struct iowait_work *wait) +{ + struct sdma_txreq *tx = NULL; + + if (!list_empty(&wait->tx_head)) { + tx = list_first_entry( + &wait->tx_head, + struct sdma_txreq, + list); + list_del_init(&tx->list); + } + return tx; +} + +static inline u16 iowait_get_desc(struct iowait_work *w) +{ + u16 num_desc = 0; + struct sdma_txreq *tx = NULL; + + if (!list_empty(&w->tx_head)) { + tx = list_first_entry( + &w->tx_head, + struct sdma_txreq, + list); + num_desc = tx->num_desc; + } + return num_desc; +} + +static inline u32 iowait_get_all_desc(struct iowait *w) +{ + u32 num_desc = 0; + + num_desc = iowait_get_desc(&w->wait[IOWAIT_IB_SE]); + num_desc += iowait_get_desc(&w->wait[IOWAIT_TID_SE]); + return num_desc; +} + +/** + * iowait_packet_queued() - determine if a packet it queued + * @wait: the wait structure + */ +static inline bool iowait_packet_queued(struct iowait_work *w) +{ + return !list_empty(&w->tx_head); +} + +/** + * inc_wait_count - increment wait counts + * @w: the log work struct + * @n: the count + */ +static inline void iowait_inc_wait_count(struct iowait_work *w, u16 n) +{ + if (!w) + return; + w->iow->tx_count++; + w->iow->count += n; +} + +/** + * iowait_get_tid_work - return iowait_work for tid SE + * @w: the iowait struct + */ +static inline struct iowait_work *iowait_get_tid_work(struct iowait *w) +{ + return &w->wait[IOWAIT_TID_SE]; +} + +/** + * iowait_get_ib_work - return iowait_work for ib SE + * @w: the iowait struct + */ +static inline struct iowait_work *iowait_get_ib_work(struct iowait *w) +{ + return &w->wait[IOWAIT_IB_SE]; +} + +/** + * iowait_ioww_to_iow - return iowait given iowait_work + * @w: the iowait_work struct + */ +static inline struct iowait *iowait_ioww_to_iow(struct iowait_work *w) +{ + if (likely(w)) + return w->iow; + return NULL; +} + +void iowait_cancel_work(struct iowait *w); +int iowait_set_work_flag(struct iowait_work *w); + +#endif diff --git a/kernel/include/hfi1/sdma.h b/kernel/include/hfi1/sdma.h index 7edcaaa7..7a9201e1 100644 --- a/kernel/include/hfi1/sdma.h +++ b/kernel/include/hfi1/sdma.h @@ -47,11 +47,6 @@ * */ -#include -#include - -#ifdef __HFI1_ORIG__ - #include #include #include @@ -62,11 +57,6 @@ #include "verbs.h" #include "sdma_txreq.h" -#define hfi1_cdbg(which, fmt, ...) \ - __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__) -extern void __hfi1_trace_AIOWRITE(const char *func, char *fmt, ...); -#endif /* __HFI1_ORIG__ */ - /* Hardware limit */ #define MAX_DESC 64 /* Hardware limit for SDMA packet size */ @@ -202,7 +192,6 @@ struct sdma_set_state_action { unsigned go_s99_running_totrue:1; }; -#ifdef __HFI1_ORIG__ struct sdma_state { struct kref kref; struct completion comp; @@ -214,11 +203,6 @@ struct sdma_state { unsigned previous_op; enum sdma_events last_event; }; -#else -struct sdma_state { - enum sdma_states current_state; -}; -#endif /* __HFI1_ORIG__ */ /** * DOC: sdma exported routines @@ -410,7 +394,6 @@ struct sdma_engine { /* private: */ struct list_head dmawait; -#ifdef __HFI1_ORIG__ /* CONFIG SDMA for now, just blindly duplicate */ /* private: */ struct tasklet_struct sdma_hw_clean_up_task @@ -426,20 +409,14 @@ struct sdma_engine { u32 progress_check_head; /* private: */ struct work_struct flush_worker; -#endif /* __HFI1_ORIG__ */ /* protect flush list */ spinlock_t flushlist_lock; /* private: */ struct list_head flushlist; -#ifdef __HFI1_ORIG__ struct cpumask cpu_mask; struct kobject kobj; -#endif /* __HFI1_ORIG__ */ }; - -#ifdef __HFI1_ORIG__ - int sdma_init(struct hfi1_devdata *dd, u8 port); void sdma_start(struct hfi1_devdata *dd); void sdma_exit(struct hfi1_devdata *dd); @@ -464,7 +441,6 @@ static inline int sdma_empty(struct sdma_engine *sde) return sde->descq_tail == sde->descq_head; } -#endif /* __HFI1_ORIG__ */ static inline u16 sdma_descq_freecnt(struct sdma_engine *sde) { return sde->descq_cnt - @@ -502,11 +478,9 @@ static inline int sdma_running(struct sdma_engine *engine) unsigned long flags; int ret; - hfi1_cdbg(AIOWRITE, "+"); spin_lock_irqsave(&engine->tail_lock, flags); ret = __sdma_running(engine); spin_unlock_irqrestore(&engine->tail_lock, flags); - hfi1_cdbg(AIOWRITE, "-"); return ret; } @@ -645,7 +619,6 @@ static inline int sdma_txinit( { return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb); } -#ifdef __HFI1_ORIG__ /* helpers - don't use */ static inline int sdma_mapping_type(struct sdma_desc *d) @@ -666,7 +639,6 @@ static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d) >> SDMA_DESC0_PHY_ADDR_SHIFT; } -#endif /* __HFI1_ORIG__ */ static inline void make_tx_sdma_desc( struct sdma_txreq *tx, int type, @@ -694,6 +666,7 @@ static inline void make_tx_sdma_desc( int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx, int type, void *kvaddr, struct page *page, unsigned long offset, u16 len); +int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *); void __sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *); static inline void sdma_txclean(struct hfi1_devdata *dd, struct sdma_txreq *tx) @@ -701,8 +674,6 @@ static inline void sdma_txclean(struct hfi1_devdata *dd, struct sdma_txreq *tx) if (tx->num_desc) __sdma_txclean(dd, tx); } -#ifdef __HFI1_ORIG__ -int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *); /* helpers used by public routines */ static inline void _sdma_close_tx(struct hfi1_devdata *dd, @@ -718,7 +689,6 @@ static inline void _sdma_close_tx(struct hfi1_devdata *dd, SDMA_DESC1_INT_REQ_FLAG); } -#endif /* __HFI1_ORIG__ */ static inline int _sdma_txadd_daddr( struct hfi1_devdata *dd, int type, @@ -737,13 +707,11 @@ static inline int _sdma_txadd_daddr( /* special cases for last */ if (!tx->tlen) { if (tx->packet_len & (sizeof(u32) - 1)) { - //TODO: _pad_sdma_tx_descs - //rval = _pad_sdma_tx_descs(dd, tx); + rval = _pad_sdma_tx_descs(dd, tx); if (rval) return rval; } else { - //TODO: _sdma_close_tx - //_sdma_close_tx(dd, tx); + _sdma_close_tx(dd, tx); } } tx->num_desc++; @@ -775,7 +743,7 @@ static inline int sdma_txadd_page( { dma_addr_t addr; int rval; - hfi1_cdbg(AIOWRITE, "+"); + if ((unlikely(tx->num_desc == tx->desc_limit))) { rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_PAGE, NULL, page, offset, len); @@ -783,7 +751,6 @@ static inline int sdma_txadd_page( return rval; } -#ifdef __HFI1_ORIG__ addr = dma_map_page( &dd->pcidev->dev, page, @@ -795,16 +762,7 @@ static inline int sdma_txadd_page( __sdma_txclean(dd, tx); return -ENOSPC; } -#else - //TODO: dma_map_page -#endif /* __HFI1_ORIG__ */ - hfi1_cdbg(AIOWRITE, "-"); - /* - * XXX: It seems that this is the place where the reference to - * the payload is added, but addr is kernel virtual here. - * TODO: verify this by printing it out in Linux. - */ return _sdma_txadd_daddr( dd, SDMA_MAP_PAGE, tx, addr, len); } @@ -875,7 +833,6 @@ static inline int sdma_txadd_kvaddr( return rval; } -#ifdef __HFI1_ORIG__ addr = dma_map_single( &dd->pcidev->dev, kvaddr, @@ -886,9 +843,6 @@ static inline int sdma_txadd_kvaddr( __sdma_txclean(dd, tx); return -ENOSPC; } -#else -//TODO: dma_map_single -#endif /* __HFI1_ORIG__ */ return _sdma_txadd_daddr( dd, SDMA_MAP_SINGLE, tx, addr, len); @@ -931,7 +885,6 @@ static inline u32 sdma_build_ahg_descriptor( ((data & SDMA_AHG_VALUE_MASK) << SDMA_AHG_VALUE_SHIFT)); } -#ifdef __HFI1_ORIG__ /** * sdma_progress - use seq number of detect head progress @@ -1108,7 +1061,6 @@ struct sdma_engine *sdma_select_engine_sc( u32 selector, u8 sc5); -#endif /* __HFI1_ORIG__ */ struct sdma_engine *sdma_select_engine_vl( struct hfi1_devdata *dd, u32 selector, @@ -1116,8 +1068,6 @@ struct sdma_engine *sdma_select_engine_vl( struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd, u32 selector, u8 vl); -#ifdef __HFI1_ORIG__ - ssize_t sdma_get_cpu_to_sde_map(struct sdma_engine *sde, char *buf); ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf, size_t count); @@ -1145,5 +1095,4 @@ extern uint mod_num_sdma; void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid); -#endif /* __HFI1_ORIG__ */ #endif diff --git a/kernel/include/hfi1/sdma_txreq.h b/kernel/include/hfi1/sdma_txreq.h new file mode 100644 index 00000000..24bb6f1b --- /dev/null +++ b/kernel/include/hfi1/sdma_txreq.h @@ -0,0 +1,135 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef HFI1_SDMA_TXREQ_H +#define HFI1_SDMA_TXREQ_H + +/* increased for AHG */ +#define NUM_DESC 6 + +/* + * struct sdma_desc - canonical fragment descriptor + * + * This is the descriptor carried in the tx request + * corresponding to each fragment. + * + */ +struct sdma_desc { + /* private: don't use directly */ + u64 qw[2]; +}; + +/** + * struct sdma_txreq - the sdma_txreq structure (one per packet) + * @list: for use by user and by queuing for wait + * + * This is the representation of a packet which consists of some + * number of fragments. Storage is provided to within the structure. + * for all fragments. + * + * The storage for the descriptors are automatically extended as needed + * when the currently allocation is exceeded. + * + * The user (Verbs or PSM) may overload this structure with fields + * specific to their use by putting this struct first in their struct. + * The method of allocation of the overloaded structure is user dependent + * + * The list is the only public field in the structure. + * + */ + +#define SDMA_TXREQ_S_OK 0 +#define SDMA_TXREQ_S_SENDERROR 1 +#define SDMA_TXREQ_S_ABORTED 2 +#define SDMA_TXREQ_S_SHUTDOWN 3 + +/* flags bits */ +#define SDMA_TXREQ_F_URGENT 0x0001 +#define SDMA_TXREQ_F_AHG_COPY 0x0002 +#define SDMA_TXREQ_F_USE_AHG 0x0004 + +struct sdma_txreq; +typedef void (*callback_t)(struct sdma_txreq *, int); + +struct iowait_wait; +struct sdma_txreq { + struct list_head list; + /* private: */ + struct sdma_desc *descp; + /* private: */ + void *coalesce_buf; + /* private: */ + struct iowait *wait; + /* private: */ + callback_t complete; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + u64 sn; +#endif + /* private: - used in coalesce/pad processing */ + u16 packet_len; + /* private: - down-counted to trigger last */ + u16 tlen; + /* private: */ + u16 num_desc; + /* private: */ + u16 desc_limit; + /* private: */ + u16 next_descq_idx; + /* private: */ + u16 coalesce_idx; + /* private: flags */ + u16 flags; + /* private: */ + struct sdma_desc descs[NUM_DESC]; +}; + +static inline int sdma_txreq_built(struct sdma_txreq *tx) +{ + return tx->num_desc; +} + +#endif /* HFI1_SDMA_TXREQ_H */ diff --git a/kernel/include/hfi1/user_exp_rcv.h b/kernel/include/hfi1/user_exp_rcv.h new file mode 100644 index 00000000..6758c6d7 --- /dev/null +++ b/kernel/include/hfi1/user_exp_rcv.h @@ -0,0 +1,157 @@ +#ifndef _HFI1_USER_EXP_RCV_H +#define _HFI1_USER_EXP_RCV_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" + +#define EXP_TID_TIDLEN_MASK 0x7FFULL +#define EXP_TID_TIDLEN_SHIFT 0 +#define EXP_TID_TIDCTRL_MASK 0x3ULL +#define EXP_TID_TIDCTRL_SHIFT 20 +#define EXP_TID_TIDIDX_MASK 0x3FFULL +#define EXP_TID_TIDIDX_SHIFT 22 +#define EXP_TID_GET(tid, field) \ + (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK) + +#define EXP_TID_SET(field, value) \ + (((value) & EXP_TID_TID##field##_MASK) << \ + EXP_TID_TID##field##_SHIFT) +#define EXP_TID_CLEAR(tid, field) ({ \ + (tid) &= ~(EXP_TID_TID##field##_MASK << \ + EXP_TID_TID##field##_SHIFT); \ + }) +#define EXP_TID_RESET(tid, field, value) do { \ + EXP_TID_CLEAR(tid, field); \ + (tid) |= EXP_TID_SET(field, (value)); \ + } while (0) + +struct tid_group { + struct list_head list; + unsigned base; + u8 size; + u8 used; + u8 map; +}; + +struct tid_pageset { + u16 idx; + u16 count; +}; + +/* + * Write an "empty" RcvArray entry. + * This function exists so the TID registaration code can use it + * to write to unused/unneeded entries and still take advantage + * of the WC performance improvements. The HFI will ignore this + * write to the RcvArray entry. + */ +static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index) +{ + /* + * Doing the WC fill writes only makes sense if the device is + * present and the RcvArray has been mapped as WC memory. + */ + if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc) + writeq(0, dd->rcvarray_wc + (index * 8)); +} + +static inline u32 rcventry2tidinfo(u32 rcventry) +{ + u32 pair = rcventry & ~0x1; + + return EXP_TID_SET(IDX, pair >> 1) | + EXP_TID_SET(CTRL, 1 << (rcventry - pair)); +} + +static inline void exp_tid_group_init(struct exp_tid_set *set) +{ + INIT_LIST_HEAD(&set->list); + set->count = 0; +} + +static inline void tid_group_remove(struct tid_group *grp, + struct exp_tid_set *set) +{ + list_del_init(&grp->list); + set->count--; +} + +static inline void tid_group_add_tail(struct tid_group *grp, + struct exp_tid_set *set) +{ + list_add_tail(&grp->list, &set->list); + set->count++; +} + +static inline struct tid_group *tid_group_pop(struct exp_tid_set *set) +{ + struct tid_group *grp = + list_first_entry(&set->list, struct tid_group, list); + list_del_init(&grp->list); + set->count--; + return grp; +} + +static inline void tid_group_move(struct tid_group *group, + struct exp_tid_set *s1, + struct exp_tid_set *s2) +{ + tid_group_remove(group, s1); + tid_group_add_tail(group, s2); +} + +u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *); +int alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd); +void free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd); +int hfi1_user_exp_rcv_init(struct file *); +int hfi1_user_exp_rcv_free(struct hfi1_filedata *); +int hfi1_user_exp_rcv_setup(struct file *, struct hfi1_tid_info *); +int hfi1_user_exp_rcv_clear(struct file *, struct hfi1_tid_info *); +int hfi1_user_exp_rcv_invalid(struct file *, struct hfi1_tid_info *); + +#endif /* _HFI1_USER_EXP_RCV_H */ diff --git a/kernel/include/hfi1/user_sdma.h b/kernel/include/hfi1/user_sdma.h new file mode 100644 index 00000000..1b04c7cc --- /dev/null +++ b/kernel/include/hfi1/user_sdma.h @@ -0,0 +1,128 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include + +#include "common.h" +#include "iowait.h" +#include "user_exp_rcv.h" + +extern uint extended_psn; + +/* + * Define fields in the KDETH header so we can update the header + * template. + */ +#define KDETH_OFFSET_SHIFT 0 +#define KDETH_OFFSET_MASK 0x7fff +#define KDETH_OM_SHIFT 15 +#define KDETH_OM_MASK 0x1 +#define KDETH_TID_SHIFT 16 +#define KDETH_TID_MASK 0x3ff +#define KDETH_TIDCTRL_SHIFT 26 +#define KDETH_TIDCTRL_MASK 0x3 +#define KDETH_INTR_SHIFT 28 +#define KDETH_INTR_MASK 0x1 +#define KDETH_SH_SHIFT 29 +#define KDETH_SH_MASK 0x1 +#define KDETH_KVER_SHIFT 30 +#define KDETH_KVER_MASK 0x3 +#define KDETH_JKEY_SHIFT 0x0 +#define KDETH_JKEY_MASK 0xff +#define KDETH_HCRC_UPPER_SHIFT 16 +#define KDETH_HCRC_UPPER_MASK 0xff +#define KDETH_HCRC_LOWER_SHIFT 24 +#define KDETH_HCRC_LOWER_MASK 0xff + +#define AHG_KDETH_INTR_SHIFT 12 +#define AHG_KDETH_SH_SHIFT 13 + +#define KDETH_GET(val, field) \ + (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK) +#define KDETH_SET(dw, field, val) do { \ + u32 dwval = le32_to_cpu(dw); \ + dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \ + dwval |= (((val) & KDETH_##field##_MASK) << \ + KDETH_##field##_SHIFT); \ + dw = cpu_to_le32(dwval); \ + } while (0) +#define KDETH_RESET(dw, field, val) ({ dw = 0; KDETH_SET(dw, field, val); }) + +/* KDETH OM multipliers and switch over point */ +#define KDETH_OM_SMALL 4 +#define KDETH_OM_LARGE 64 +#define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1)) + +struct hfi1_user_sdma_pkt_q { + struct list_head list; + unsigned ctxt; + unsigned subctxt; + u16 n_max_reqs; + atomic_t n_reqs; + u16 reqidx; + struct hfi1_devdata *dd; + struct kmem_cache *txreq_cache; + struct user_sdma_request *reqs; + unsigned long *req_in_use; + struct iowait busy; + unsigned state; + wait_queue_head_t wait; + unsigned long unpinned; + struct mmu_rb_handler *handler; + atomic_t n_locked; + struct mm_struct *mm; +}; + +struct hfi1_user_sdma_comp_q { + u16 nentries; + struct hfi1_sdma_comp_entry *comps; +}; + +int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *, struct file *); +int hfi1_user_sdma_free_queues(struct hfi1_filedata *); +int hfi1_user_sdma_process_request(struct file *, struct iovec *, unsigned long, + unsigned long *); diff --git a/kernel/sdma.c b/kernel/sdma.c index 62d8661f..22d2fb2e 100644 --- a/kernel/sdma.c +++ b/kernel/sdma.c @@ -45,15 +45,6 @@ * */ -#include -#include -#include -#include - -unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT; - -#ifdef __HFI1_ORIG__ - #include #include #include @@ -71,15 +62,11 @@ unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT; #include "iowait.h" #include "trace.h" -#endif /* __HFI1_ORIG__ */ - /* must be a power of 2 >= 64 <= 32768 */ #define SDMA_DESCQ_CNT 2048 #define SDMA_DESC_INTR 64 #define INVALID_TAIL 0xffff -#ifdef __HFI1_ORIG__ - static uint sdma_descq_cnt = SDMA_DESCQ_CNT; module_param(sdma_descq_cnt, uint, S_IRUGO); MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries"); @@ -239,9 +226,7 @@ static const struct sdma_set_state_action sdma_action_table[] = { }, }; -#endif /* __HFI1_ORIG__ */ #define SDMA_TAIL_UPDATE_THRESH 0x1F -#ifdef __HFI1_ORIG__ /* declare all statics here rather than keep sorting */ static void sdma_complete(struct kref *); @@ -383,7 +368,7 @@ static inline void complete_tx(struct sdma_engine *sde, /* protect against complete modifying */ struct iowait *wait = tx->wait; callback_t complete = tx->complete; - hfi1_cdbg(AIOWRITE, "+"); + #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER trace_hfi1_sdma_out_sn(sde, tx->sn); if (WARN_ON_ONCE(sde->head_sn != tx->sn)) @@ -396,7 +381,6 @@ static inline void complete_tx(struct sdma_engine *sde, (*complete)(tx, res); if (iowait_sdma_dec(wait)) iowait_drain_wakeup(wait); - hfi1_cdbg(AIOWRITE, "-"); } /* @@ -789,16 +773,11 @@ struct sdma_engine *sdma_select_engine_vl( struct sdma_map_elem *e; struct sdma_engine *rval; - hfi1_cdbg(AIOWRITE, "+"); /* NOTE This should only happen if SC->VL changed after the initial * checks on the QP/AH * Default will return engine 0 below */ -#ifdef __HFI1_ORIG__ if (vl >= num_vls) { -#else - if (vl >= HFI1_MAX_VLS_SUPPORTED) { -#endif /* __HFI1_ORIG__ */ rval = NULL; goto done; } @@ -816,7 +795,6 @@ struct sdma_engine *sdma_select_engine_vl( done: rval = !rval ? &dd->per_sdma[0] : rval; trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx); - hfi1_cdbg(AIOWRITE, "-"); return rval; } @@ -886,7 +864,6 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd, const struct cpumask *current_mask = tsk_cpus_allowed(current); unsigned long cpu_id; - hfi1_cdbg(AIOWRITE, "+"); /* * To ensure that always the same sdma engine(s) will be * selected make sure the process is pinned to this CPU only. @@ -1681,7 +1658,6 @@ static inline void sdma_unmap_desc( break; } } -#endif /* __HFI1_ORIG__ */ /* * return the mode as indicated by the first @@ -1713,15 +1689,13 @@ void __sdma_txclean( if (tx->num_desc) { u8 skip = 0, mode = ahg_mode(tx); - /* TODO: enable sdma_unmap_desc */ /* unmap first */ - //sdma_unmap_desc(dd, &tx->descp[0]); + sdma_unmap_desc(dd, &tx->descp[0]); /* determine number of AHG descriptors to skip */ if (mode > SDMA_AHG_APPLY_UPDATE1) skip = mode >> 1; - /* TODO: enable sdma_unmap_desc */ - // for (i = 1 + skip; i < tx->num_desc; i++) - // sdma_unmap_desc(dd, &tx->descp[i]); + for (i = 1 + skip; i < tx->num_desc; i++) + sdma_unmap_desc(dd, &tx->descp[i]); tx->num_desc = 0; } kfree(tx->coalesce_buf); @@ -1732,7 +1706,6 @@ void __sdma_txclean( kfree(tx->descp); } } -#ifdef __HFI1_ORIG__ static inline u16 sdma_gethead(struct sdma_engine *sde) { @@ -1851,7 +1824,6 @@ static void sdma_make_progress(struct sdma_engine *sde, u64 status) u16 hwhead, swhead; int idle_check_done = 0; - hfi1_cdbg(AIOWRITE, "+"); hwhead = sdma_gethead(sde); /* The reason for some of the complexity of this code is that @@ -1903,7 +1875,6 @@ retry: sde->last_status = status; if (progress) sdma_desc_avail(sde, sdma_descq_freecnt(sde)); - hfi1_cdbg(AIOWRITE, "-"); } /* @@ -1917,7 +1888,6 @@ retry: */ void sdma_engine_interrupt(struct sdma_engine *sde, u64 status) { - hfi1_cdbg(AIOWRITE, "+"); trace_hfi1_sdma_engine_interrupt(sde, status); write_seqlock(&sde->head_lock); sdma_set_desc_cnt(sde, sdma_desct_intr); @@ -1929,7 +1899,6 @@ void sdma_engine_interrupt(struct sdma_engine *sde, u64 status) sde->sdma_int_cnt++; sdma_make_progress(sde, status); write_sequnlock(&sde->head_lock); - hfi1_cdbg(AIOWRITE, "-"); } /** @@ -2031,15 +2000,12 @@ static void sdma_setlengen(struct sdma_engine *sde) (4ULL << SD(LEN_GEN_GENERATION_SHIFT))); } -#endif /* __HFI1_ORIG__ */ static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail) { - hfi1_cdbg(AIOWRITE, "."); /* Commit writes to memory and advance the tail on the chip */ smp_wmb(); /* see get_txhead() */ writeq(tail, sde->tail_csr); } -#ifdef __HFI1_ORIG__ /* * This is called when changing to state s10_hw_start_up_halt_wait as @@ -2304,7 +2270,6 @@ void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde) } } -#endif /* __HFI1_ORIG__ */ /* * add the generation number into * the qw1 and return @@ -2341,12 +2306,12 @@ static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx) u16 tail; struct sdma_desc *descp = tx->descp; u8 skip = 0, mode = ahg_mode(tx); - hfi1_cdbg(AIOWRITE, "+"); + tail = sde->descq_tail & sde->sdma_mask; sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]); sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1])); - // trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1], - // tail, &sde->descq[tail]); + trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1], + tail, &sde->descq[tail]); tail = ++sde->descq_tail & sde->sdma_mask; descp++; if (mode > SDMA_AHG_APPLY_UPDATE1) @@ -2364,19 +2329,18 @@ static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx) qw1 = add_gen(sde, descp->qw[1]); } sde->descq[tail].qw[1] = cpu_to_le64(qw1); - // trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1, - // tail, &sde->descq[tail]); + trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1, + tail, &sde->descq[tail]); tail = ++sde->descq_tail & sde->sdma_mask; } tx->next_descq_idx = tail; #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER tx->sn = sde->tail_sn++; - // trace_hfi1_sdma_in_sn(sde, tx->sn); + trace_hfi1_sdma_in_sn(sde, tx->sn); WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]); #endif sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx; sde->desc_avail -= tx->num_desc; - hfi1_cdbg(AIOWRITE, "-"); return tail; } @@ -2390,7 +2354,6 @@ static int sdma_check_progress( { int ret; - hfi1_cdbg(AIOWRITE, "+"); sde->desc_avail = sdma_descq_freecnt(sde); if (tx->num_desc <= sde->desc_avail) return -EAGAIN; @@ -2406,10 +2369,8 @@ static int sdma_check_progress( } else { ret = -EBUSY; } - hfi1_cdbg(AIOWRITE, "-"); return ret; } -#ifdef __HFI1_ORIG__ /** * sdma_send_txreq() - submit a tx req to ring @@ -2433,7 +2394,6 @@ int sdma_send_txreq(struct sdma_engine *sde, u16 tail; unsigned long flags; - hfi1_cdbg(AIOWRITE, "+"); /* user should have supplied entire packet */ if (unlikely(tx->tlen)) return -EINVAL; @@ -2450,7 +2410,6 @@ retry: sdma_update_tail(sde, tail); unlock: spin_unlock_irqrestore(&sde->tail_lock, flags); - hfi1_cdbg(AIOWRITE, "-"); return ret; unlock_noconn: if (wait) @@ -2477,7 +2436,6 @@ nodesc: goto unlock; } -#endif /* __HFI1_ORIG__ */ /** * sdma_send_txlist() - submit a list of tx req to ring * @sde: sdma engine to use @@ -2515,13 +2473,8 @@ int sdma_send_txlist(struct sdma_engine *sde, struct iowait_work *wait, u16 tail = INVALID_TAIL; u32 submit_count = 0, flush_count = 0, total_count; - hfi1_cdbg(AIOWRITE, "+"); spin_lock_irqsave(&sde->tail_lock, flags); retry: - /* - * XXX: do we really need to build tx_list?? - * We could just submit the requests straight to the HW.. - */ list_for_each_entry_safe(tx, tx_next, tx_list, list) { tx->wait = iowait_ioww_to_iow(wait); if (unlikely(!__sdma_running(sde))) @@ -2549,7 +2502,6 @@ update_tail: sdma_update_tail(sde, tail); spin_unlock_irqrestore(&sde->tail_lock, flags); *count_out = total_count; - hfi1_cdbg(AIOWRITE, "-"); return ret; unlock_noconn: spin_lock(&sde->flushlist_lock); @@ -2559,15 +2511,14 @@ unlock_noconn: tx->next_descq_idx = 0; #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER tx->sn = sde->tail_sn++; - // trace_hfi1_sdma_in_sn(sde, tx->sn); + trace_hfi1_sdma_in_sn(sde, tx->sn); #endif list_add_tail(&tx->list, &sde->flushlist); flush_count++; iowait_inc_wait_count(wait, tx->num_desc); } spin_unlock(&sde->flushlist_lock); - // TODO: schedule_work - //schedule_work(&sde->flush_worker); + schedule_work(&sde->flush_worker); ret = -ECOMM; goto update_tail; nodesc: @@ -2579,7 +2530,6 @@ nodesc: sde->descq_full_count++; goto update_tail; } -#ifdef __HFI1_ORIG__ static void sdma_process_event(struct sdma_engine *sde, enum sdma_events event) { @@ -3133,7 +3083,6 @@ enomem: return -ENOMEM; } -#endif /* __HFI1_ORIG__ */ /* * ext_coal_sdma_tx_descs() - extend or coalesce sdma tx descriptors * @@ -3154,8 +3103,6 @@ int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx, int type, void *kvaddr, struct page *page, unsigned long offset, u16 len) { -//TODO: ext_coal_sdma_tx_descs -#ifdef __HFI1_ORIG__ int pad_len, rval; dma_addr_t addr; @@ -3215,10 +3162,9 @@ int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx, return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx, addr, tx->tlen); } -#endif /* __HFI1_ORIG__ */ + return 1; } -#ifdef __HFI1_ORIG__ /* Update sdes when the lmc changes */ void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid) @@ -3263,7 +3209,6 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) return rval; } -#endif /* __HFI1_ORIG__ */ /* * Add ahg to the sdma_txreq * @@ -3371,7 +3316,6 @@ void sdma_ahg_free(struct sdma_engine *sde, int ahg_index) return; clear_bit(ahg_index, &sde->ahg_bits); } -#ifdef __HFI1_ORIG__ /* * SPC freeze handling for SDMA engines. Called when the driver knows @@ -3466,5 +3410,3 @@ void _sdma_engine_progress_schedule( CCE_INT_FORCE + (8 * (IS_SDMA_START / 64)), sde->progress_mask); } - -#endif /* __HFI1_ORIG__ */ diff --git a/kernel/user_sdma.c b/kernel/user_sdma.c index aae53e5e..d626dddf 100644 --- a/kernel/user_sdma.c +++ b/kernel/user_sdma.c @@ -44,13 +44,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ - -#include -#include -#include - -#ifdef __HFI1_ORIG__ - #include #include #include @@ -76,11 +69,9 @@ #include "trace.h" #include "mmu_rb.h" - +static uint hfi1_sdma_comp_ring_size = 128; module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO); MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128"); -#endif /* __HFI1_ORIG__ */ -static uint hfi1_sdma_comp_ring_size = 128; /* The maximum number of Data io vectors per message/request */ #define MAX_VECTORS_PER_REQ 8 @@ -142,11 +133,8 @@ static unsigned initial_pkt_count = 8; #define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */ -#ifdef __HFI1_ORIG__ - struct sdma_mmu_node; -#endif /* __HFI1_ORIG__ */ struct user_sdma_iovec { struct list_head list; struct iovec iov; @@ -161,7 +149,6 @@ struct user_sdma_iovec { u64 offset; struct sdma_mmu_node *node; }; -#ifdef __HFI1_ORIG__ struct sdma_mmu_node { struct mmu_rb_node rb; @@ -177,7 +164,6 @@ struct evict_data { u32 target; /* target count to evict */ }; -#endif /* __HFI1_ORIG__ */ struct user_sdma_request { struct sdma_req_info info; struct hfi1_user_sdma_pkt_q *pq; @@ -251,7 +237,6 @@ struct user_sdma_txreq { unsigned busycount; u64 seqnum; }; -#ifdef __HFI1_ORIG__ #define SDMA_DBG(req, fmt, ...) \ hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \ @@ -261,28 +246,24 @@ struct user_sdma_txreq { hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \ (pq)->subctxt, ##__VA_ARGS__) -#endif /* __HFI1_ORIG__ */ - static int user_sdma_send_pkts(struct user_sdma_request *, unsigned); +static int num_user_pages(const struct iovec *); +static void user_sdma_txreq_cb(struct sdma_txreq *, int); static inline void pq_update(struct hfi1_user_sdma_pkt_q *); +static void user_sdma_free_request(struct user_sdma_request *, bool); +static int pin_vector_pages(struct user_sdma_request *, + struct user_sdma_iovec *); +static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned, + unsigned); static int check_header_template(struct user_sdma_request *, struct hfi1_pkt_header *, u32, u32); static int set_txreq_header(struct user_sdma_request *, struct user_sdma_txreq *, u32); static int set_txreq_header_ahg(struct user_sdma_request *, struct user_sdma_txreq *, u32); -static void user_sdma_free_request(struct user_sdma_request *, bool); static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *, struct hfi1_user_sdma_comp_q *, u16, enum hfi1_sdma_comp_state, int); -static void user_sdma_txreq_cb(struct sdma_txreq *, int); - -#ifdef __HFI1_ORIG__ -static int num_user_pages(const struct iovec *); -static int pin_vector_pages(struct user_sdma_request *, - struct user_sdma_iovec *); -static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned, - unsigned); static inline u32 set_pkt_bth_psn(__be32, u8, u32); static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); @@ -495,8 +476,6 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) return 0; } -#endif /* __HFI1_ORIG__ */ - static u8 dlid_to_selector(u16 dlid) { static u8 mapping[256]; @@ -518,19 +497,11 @@ static u8 dlid_to_selector(u16 dlid) return mapping[hash]; } -#ifdef __HFI1_ORIG__ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, unsigned long dim, unsigned long *count) { int ret = 0, i; struct hfi1_filedata *fd = fp->private_data; -#else -int hfi1_user_sdma_process_request(void *private_data, struct iovec *iovec, - unsigned long dim, unsigned long *count) -{ - int ret = 0, i; - struct hfi1_filedata *fd = private_data; -#endif /* __HFI1_ORIG__ */ struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_user_sdma_pkt_q *pq = fd->pq; struct hfi1_user_sdma_comp_q *cq = fd->cq; @@ -544,7 +515,6 @@ int hfi1_user_sdma_process_request(void *private_data, struct iovec *iovec, u16 dlid; u32 selector; - hfi1_cdbg(AIOWRITE, "+"); if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) { hfi1_cdbg( SDMA, @@ -560,8 +530,8 @@ int hfi1_user_sdma_process_request(void *private_data, struct iovec *iovec, return -EFAULT; } - // trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, - // (u16 *)&info); + trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, + (u16 *)&info); if (info.comp_idx >= hfi1_sdma_comp_ring_size) { hfi1_cdbg(SDMA, @@ -611,12 +581,6 @@ int hfi1_user_sdma_process_request(void *private_data, struct iovec *iovec, memcpy(&req->info, &info, sizeof(info)); - /* - * Seems like iovec[0] is always the header and if EXPECTED, - * TID is present is in iovec[dim - 1] - * The rest in between are data. - */ - if (req_opcode(info.ctrl) == EXPECTED) { /* expected must have a TID info and at least one data vector */ if (req->data_iovs < 2) { @@ -655,9 +619,6 @@ int hfi1_user_sdma_process_request(void *private_data, struct iovec *iovec, ret = -EINVAL; goto free_req; } - -// TODO: Enable this validation and checking -#ifdef __HFI1_ORIG__ /* * Validate the vl. Do not trust packets from user space blindly. * VL comes from PBC, SC comes from LRH, and the VL needs to @@ -679,7 +640,6 @@ int hfi1_user_sdma_process_request(void *private_data, struct iovec *iovec, ret = -EINVAL; goto free_req; } -#endif /* __HFI1_ORIG__ */ /* * Also should check the BTH.lnh. If it says the next header is GRH then @@ -706,22 +666,12 @@ int hfi1_user_sdma_process_request(void *private_data, struct iovec *iovec, /* Save all the IO vector structures */ for (i = 0; i < req->data_iovs; i++) { INIT_LIST_HEAD(&req->iovs[i].list); - /* - * XXX: iovec is still user-space in McKernel here!! - * - * req->iovs[] contain only the data. - */ memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec)); -#ifdef __HFI1_ORIG__ - hfi1_cdbg(AIOWRITE, "+pin_vector_pages"); - // TODO: pin_vector_pages ret = pin_vector_pages(req, &req->iovs[i]); - hfi1_cdbg(AIOWRITE, "-pin_vector_pages"); if (ret) { req->status = ret; goto free_req; } -#endif /* __HFI1_ORIG__ */ req->data_len += req->iovs[i].iov.iov_len; } SDMA_DBG(req, "total data length %u", req->data_len); @@ -769,13 +719,7 @@ int hfi1_user_sdma_process_request(void *private_data, struct iovec *iovec, dlid = be16_to_cpu(req->hdr.lrh[1]); selector = dlid_to_selector(dlid); selector += uctxt->ctxt + fd->subctxt; - /* TODO: check the rcu stuff */ - /* - * XXX: didn't we conclude that we don't need to worry about RCU here? - * the mapping is created at driver initialization, the rest of the - * accesses are read-only - */ - //req->sde = sdma_select_user_engine(dd, selector, vl); + req->sde = sdma_select_user_engine(dd, selector, vl); if (!req->sde || !sdma_running(req->sde)) { ret = -ECOMM; @@ -828,28 +772,20 @@ int hfi1_user_sdma_process_request(void *private_data, struct iovec *iovec, goto free_req; return ret; } -#ifdef __HFI1_ORIG__ - hfi1_cdbg(AIOWRITE, "+wait_event_interruptible_timeout"); wait_event_interruptible_timeout( pq->busy.wait_dma, (pq->state == SDMA_PKT_Q_ACTIVE), msecs_to_jiffies( SDMA_IOWAIT_TIMEOUT)); - hfi1_cdbg(AIOWRITE, "-wait_event_interruptible_timeout"); -#else - while (pq->state != SDMA_PKT_Q_ACTIVE) cpu_pause(); -#endif /* __HFI1_ORIG__ */ } } *count += idx; - hfi1_cdbg(AIOWRITE, "-"); return 0; free_req: user_sdma_free_request(req, true); if (req_queued) pq_update(pq); set_comp_state(pq, cq, info.comp_idx, ERROR, req->status); - hfi1_cdbg(AIOWRITE, "-"); return ret; } @@ -926,7 +862,6 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) struct hfi1_user_sdma_pkt_q *pq = NULL; struct user_sdma_iovec *iovec = NULL; - hfi1_cdbg(AIOWRITE, "+"); if (!req->pq) return -EINVAL; @@ -964,11 +899,7 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) return -EFAULT; } -#ifdef __HFI1_ORIG__ tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL); -#else - tx = kmalloc(sizeof(struct user_sdma_txreq), GFP_KERNEL | __GFP_ZERO); -#endif /* __HFI1_ORIG__ */ if (!tx) return -ENOMEM; @@ -1094,11 +1025,6 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) unsigned long base, offset; unsigned pageidx, len; - /* - * XXX: this seems to operate with the assumption - * that all user buffers need to be processed in - * PAGE_SIZE steps, how about large pages? - */ base = (unsigned long)iovec->iov.iov_base; offset = offset_in_page(base + iovec->offset + iov_offset); @@ -1161,21 +1087,14 @@ dosend: if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) sdma_ahg_free(req->sde, req->ahg_idx); } - hfi1_cdbg(AIOWRITE, "-"); return ret; free_txreq: sdma_txclean(pq->dd, &tx->txreq); free_tx: -#ifdef __HFI1_ORIG__ kmem_cache_free(pq->txreq_cache, tx); - hfi1_cdbg(AIOWRITE, "-"); -#else - kfree(tx); -#endif /* __HFI1_ORIG__ */ return ret; } -#ifdef __HFI1_ORIG__ /* * How many pages in this iovec element? @@ -1293,7 +1212,6 @@ static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, kfree(pages); } -#endif /* __HFI1_ORIG__ */ static int check_header_template(struct user_sdma_request *req, struct hfi1_pkt_header *hdr, u32 lrhlen, u32 datalen) @@ -1470,8 +1388,8 @@ static int set_txreq_header(struct user_sdma_request *req, req->omfactor != KDETH_OM_SMALL); } done: - // trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt, - // req->info.comp_idx, hdr, tidval); + trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt, + req->info.comp_idx, hdr, tidval); return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr)); } @@ -1557,9 +1475,9 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val); } - // trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, - // req->info.comp_idx, req->sde->this_idx, - // req->ahg_idx, req->ahg, diff, tidval); + trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, + req->info.comp_idx, req->sde->this_idx, + req->ahg_idx, req->ahg, diff, tidval); return diff; } @@ -1592,8 +1510,7 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) } req->seqcomp = tx->seqnum; - //TODO: kmem_cache_free - //kmem_cache_free(pq->txreq_cache, tx); + kmem_cache_free(pq->txreq_cache, tx); tx = NULL; idx = req->info.comp_idx; @@ -1621,14 +1538,12 @@ static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) { if (atomic_dec_and_test(&pq->n_reqs)) { xchg(&pq->state, SDMA_PKT_Q_INACTIVE); - //TODO: wake_up - //wake_up(&pq->wait); + wake_up(&pq->wait); } } static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) { - hfi1_cdbg(AIOWRITE, "+"); if (!list_empty(&req->txps)) { struct sdma_txreq *t, *p; @@ -1637,9 +1552,7 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) container_of(t, struct user_sdma_txreq, txreq); list_del_init(&t->list); sdma_txclean(req->pq->dd, t); -#ifdef __HFI1_ORIG__ kmem_cache_free(req->pq->txreq_cache, tx); -#endif /* __HFI1_ORIG__ */ } } if (req->data_iovs) { @@ -1651,20 +1564,17 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) if (!node) continue; -//TODO: -#ifdef __HFI1_ORIG__ if (unpin) hfi1_mmu_rb_remove(req->pq->handler, &node->rb); else atomic_dec(&node->refcount); -#endif /* __HFI1_ORIG__ */ } } kfree(req->tids); clear_bit(req->info.comp_idx, req->pq->req_in_use); - hfi1_cdbg(AIOWRITE, "-"); } + static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, struct hfi1_user_sdma_comp_q *cq, u16 idx, enum hfi1_sdma_comp_state state, @@ -1675,10 +1585,9 @@ static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, cq->comps[idx].status = state; if (state == ERROR) cq->comps[idx].errcode = -ret; - // trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt, - // idx, state, ret); + trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt, + idx, state, ret); } -#ifdef __HFI1_ORIG__ static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, unsigned long len) @@ -1742,5 +1651,3 @@ static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode) return 1; return 0; } - -#endif /* __HFI1_ORIG__ */