#include "u.h" #include "../port/lib.h" #include "mem.h" #include "dat.h" #include "fns.h" #include "io.h" #include "../xen/xen.h" #define LOG(a) #define DATASEGM(p) { 0xFFFF, SEGG|SEGB|(0xF<<16)|SEGP|SEGPL(p)|SEGDATA|SEGW } #define EXECSEGM(p) { 0xFFFF, SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(p)|SEGEXEC|SEGR } #define TSSSEGM(b,p) { ((b)<<16)|sizeof(Tss),\ ((b)&0xFF000000)|(((b)>>16)&0xFF)|SEGTSS|SEGPL(p)|SEGP } Segdesc gdt[NGDT] = { [NULLSEG] { 0, 0}, /* null descriptor */ [KDSEG] DATASEGM(0), /* kernel data/stack */ [KESEG] EXECSEGM(0), /* kernel code */ [UDSEG] DATASEGM(3), /* user data/stack */ [UESEG] EXECSEGM(3), /* user code */ [TSSSEG] TSSSEGM(0,0), /* tss segment */ }; static void taskswitch(/*ulong */ulong * pdb, ulong stack) { Tss *tss; tss = m->tss; tss->ss0 = KDSEL; tss->esp0 = stack; tss->ss1 = KDSEL; tss->esp1 = stack; tss->ss2 = KDSEL; tss->esp2 = stack; tss->cr3 = PADDR(pdb); HYPERVISOR_stack_switch(KDSEL, stack); putcr3(pdb); } /* * On processors that support it, we set the PTEGLOBAL bit in * page table and page directory entries that map kernel memory. * Doing this tells the processor not to bother flushing them * from the TLB when doing the TLB flush associated with a * context switch (write to CR3). Since kernel memory mappings * are never removed, this is safe. (If we ever remove kernel memory * mappings, we can do a full flush by turning off the PGE bit in CR4, * writing to CR3, and then turning the PGE bit back on.) * * See also mmukmap below. * * Processor support for the PTEGLOBAL bit is enabled in devarch.c. */ static void memglobal(void) { int i, j; ulong *pde, *pte; /* only need to do this once, on bootstrap processor */ if(m->machno != 0) return; if(!m->havepge) return; pde = m->pdb; for(i=512; i<1024; i++){ /* 512: start at entry for virtual 0x80000000 */ if(pde[i] & PTEVALID){ pde[i] |= PTEGLOBAL; if(!(pde[i] & PTESIZE)){ pte = KADDR(pde[i]&~(BY2PG-1)); for(j=0; j<1024; j++) if(pte[j] & PTEVALID) pte[j] |= PTEGLOBAL; } } } } void mmuinit(void) { ulong x; ushort ptr[3]; extern int rtsr(void); memglobal(); m->tss = malloc(sizeof(Tss)); memset(m->tss, 0, sizeof(Tss)); /* * We used to keep the GDT in the Mach structure, but it * turns out that that slows down access to the rest of the * page. Since the Mach structure is accessed quite often, * it pays off anywhere from a factor of 1.25 to 2 on real * hardware to separate them (the AMDs are more sensitive * than Intels in this regard). Under VMware it pays off * a factor of about 10 to 100. */ memmove(m->gdt, gdt, sizeof gdt); x = (ulong)m->tss; m->gdt[TSSSEG].d0 = (x<<16)|sizeof(Tss); m->gdt[TSSSEG].d1 = (x&0xFF000000)|((x>>16)&0xFF)|SEGTSS|SEGPL(0)|SEGP; ptr[0] = sizeof(gdt)-1; x = (ulong)m->gdt; ptr[1] = x & 0xFFFF; ptr[2] = (x>>16) & 0xFFFF; LOG(dp("NOT DOING lgdt\n")); // lgdt(ptr); ptr[0] = sizeof(Segdesc)*256-1; x = IDTADDR; ptr[1] = x & 0xFFFF; ptr[2] = (x>>16) & 0xFFFF; LOG(dp("NOT DOING lidt\n")); // lidt(ptr); /* make kernel text unwritable */ LOG(dp("NOT MAKING KERNEL TEXT UNWRITABLE\n")); #ifdef not for(x = KTZERO; x < (ulong)etext; x += BY2PG){ p = mmuwalk(m->pdb, x, 2, 0); if(p == nil) panic("mmuinit"); *p &= ~PTEWRITE; } #endif LOG(dp("NOT DOING task switch or ltr\n")); // taskswitch(PADDR(m->pdb), (ulong)m + BY2PG); taskswitch(m->pdb, (ulong)m+BY2PG); #ifdef NOT ltr(TSSSEL); #endif LOG(dp("ltr is 0x%x\n", rtsr())); } void flushmmu(void) { int s; s = splhi(); up->newtlb = 1; mmuswitch(up); splx(s); } /* this can be called with an active pdb, so use Xen calls to zero it out. */ static void mmuptefree(Proc* proc) { ulong *pdb; Page **last, *page; LOG(dp("mmuptefree\n")); if(proc->mmupdb && proc->mmuused){ pdb = (ulong*)proc->mmupdb->va; LOG(dp("mmuptefree: pdb %p\n", pdb)); last = &proc->mmuused; for(page = *last; page; page = page->next){ LOG(dp("mmuptefree: free page 0x%ulx index 0x%ulx\n", page->pa, page->daddr)); queue_l2_entry_update(&pdb[page->daddr], 0); /* this is no longer a pte page. So make it readwrite */ _flush_page_update_queue(); xen_mm_readwrite((void *)page->va); //pdb[page->daddr] = 0; last = &page->next; } *last = proc->mmufree; proc->mmufree = proc->mmuused; proc->mmuused = 0; } _flush_page_update_queue(); } void mmuswitch(Proc* proc) { ulong *pdb; LOG(dp("mmuswitch\n")); if(proc->newtlb){ mmuptefree(proc); proc->newtlb = 0; } if(proc->mmupdb){ pdb = (ulong*)proc->mmupdb->va; // pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)]; queue_l2_entry_update(&pdb[PDX(MACHADDR)], m->pdb[PDX(MACHADDR)]); _flush_page_update_queue(); // pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)]; LOG(dp("MMUSWITCH: pdb[PDX(MACHADDR)] = 0x%ulx\n", m->pdb[PDX(MACHADDR)])); taskswitch((ulong *) proc->mmupdb->va /*pa*/, (ulong)(proc->kstack+KSTACK)); } else taskswitch(/*PADDR*/(m->pdb), (ulong)(proc->kstack+KSTACK)); } void mmurelease(Proc* proc) { Page *page, *next; /* * Release any pages allocated for a page directory base or page-tables * for this process: * switch to the prototype pdb for this processor (m->pdb); * call mmuptefree() to place all pages used for page-tables (proc->mmuused) * onto the process' free list (proc->mmufree). This has the side-effect of * cleaning any user entries in the pdb (proc->mmupdb); * if there's a pdb put it in the cache of pre-initialised pdb's * for this processor (m->pdbpool) or on the process' free list; * finally, place any pages freed back into the free pool (palloc). * This routine is only called from sched() with palloc locked. */ taskswitch(/*PADDR*/(m->pdb), (ulong)m + BY2PG); mmuptefree(proc); if(proc->mmupdb){ xen_mm_readwrite((void *)proc->mmupdb->va); if(m->pdbcnt > 10){ proc->mmupdb->next = proc->mmufree; proc->mmufree = proc->mmupdb; } else{ proc->mmupdb->next = m->pdbpool; m->pdbpool = proc->mmupdb; m->pdbcnt++; } proc->mmupdb = 0; } for(page = proc->mmufree; page; page = next){ next = page->next; if(--page->ref) panic("mmurelease: page->ref %d\n", page->ref); pagechainhead(page); } if(proc->mmufree && palloc.r.p) wakeup(&palloc.r); proc->mmufree = 0; } /* rules: pdb pages are write-protected coming out of pdballoc, * and only become writeable when freed. * same rules for PTEs */ static Page* mmupdballoc(void) { int s; Page *page; s = splhi(); if(m->pdbpool == 0){ spllo(); page = newpage(0, 0, 0); page->va = VA(kmap(page)); memmove((void*)page->va, m->pdb, BY2PG); } else{ page = m->pdbpool; m->pdbpool = page->next; m->pdbcnt--; } splx(s); LOG(dp("pdballoc ... do the update ... \n")); /* have to make it readonly */ xen_mm_readonly(((void *)page->va)); LOG(dp("pdballoc returns %p\n", page)); return page; } void putmmu(ulong va, ulong pa, Page*) { int pdbx; Page *page; ulong *pdb, *pte; int s; LOG(dp("putmmu for 0x%ulx, 0x%ulx, page %p, up %p\n", va, pa, p, up)); LOG(dp(" mmupdb is %p\n", up->mmupdb)); if(up->mmupdb == 0) up->mmupdb = mmupdballoc(); LOG(dp("pdb is %p\n", up->mmupdb)); pdb = (ulong*)up->mmupdb->va; pdbx = PDX(va); LOG(dp("putmmu: pdbx is 0x%x\n", pdbx)); LOG(dp("PPN(pdb[pdbx] is 0x%ulx\n", PPN(pdb[pdbx]))); if(PPN(pdb[pdbx]) == 0){ LOG(dp("putmmu: up %p\n", up)); LOG(dp("putmmu: up->mmufree %p\n", up->mmufree)); if(up->mmufree == 0){ page = newpage(1, 0, 0); page->va = VA(kmap(page)); LOG(dp("newpage, page is %p, va 0x%ulx\n", page, page->va)); } else { LOG(dp("old page, page %p, va 0x%ulx\n", page, page->va)); page = up->mmufree; up->mmufree = page->next; memset((void*)page->va, 0, BY2PG); } LOG(dp("got something ... page is %p\n")); // pdb[pdbx] = PPN(page->pa)|PTEUSER|PTEWRITE|PTEVALID; LOG(dp(" quee l2 entry update for %p\n", &pdb[pdbx])); xen_mm_setl2((void *)page->va, &pdb[pdbx]); page->daddr = pdbx; page->next = up->mmuused; up->mmuused = page; } pte = KADDR(PPN(xen_ma_to_pa(pdb[pdbx]))); LOG(dp("pte is %p\n", pte)); LOG(dp("pdb[pdbx] is now 0x%ulx, pte[PTX(va]] is 0x%ulx\n", pdb[pdbx], pte[PTX(va)])); LOG(dp("PTX is 0x%x, &pte[PTX(val)] is %p, set 0x%ulx\n", PTX(va), &pte[PTX(va)], pa|PTEUSER)); queue_l1_entry_update(&pte[PTX(va)], pa|PTEUSER); // pte[PTX(va)] = pa|PTEUSER; s = splhi(); queue_l2_entry_update(&pdb[PDX(MACHADDR)], m->pdb[PDX(MACHADDR)]); // pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)]; LOG(dp("pdb[PDX(MACHADDR)] = 0x%ulx\n", m->pdb[PDX(MACHADDR)])); mmuflushtlb((ulong *) up->mmupdb->/*pa*/va); LOG(dp("end of day, va 0x%ulx, pdb[pdbx] is 0x%ulx, pte[PTX] is 0x%ulx\n", va, pdb[pdbx], pte[PTX(va)])); LOG(dp("putmmu ends\n")); splx(s); } ulong* mmuwalk(ulong* pdb, ulong va, int level, int create) { ulong pa, *table; /* * Walk the page-table pointed to by pdb and return a pointer * to the entry for virtual address va at the requested level. * If the entry is invalid and create isn't requested then bail * out early. Otherwise, for the 2nd level walk, allocate a new * page-table page and register it in the 1st level. */ LOG(dp("pdb is %p\n", pdb)); table = &pdb[PDX(va)]; LOG(dp("table %p\n", table)); if(!(*table & PTEVALID) && create == 0) return 0; LOG(dp("switch on level\n")); switch(level){ default: return 0; case 1: return table; case 2: LOG(dp("case 2, table %p\n", table)); if(*table & PTESIZE) panic("mmuwalk2: va %luX entry %luX\n", va, *table); if(!(*table & PTEVALID)){ pa = PADDR(xspanalloc(BY2PG, BY2PG, 0)); *table = pa|PTEWRITE|PTEVALID; } table = KADDR(PPN(*table)); return &table[PTX(va)]; } } static Lock mmukmaplock; int mmukmapsync(ulong va) { Mach *mach0; ulong entry, *pte; mach0 = MACHP(0); LOG(dp("mmukmapsync: va 0x%ulx, mach0 %p\n", va, mach0)); LOG(dp("mach0->pdb is %p\n", mach0->pdb)); /* don't need this any more ... if (va == 0) panic("va is 0\n"); */ LOG(dp("mmuwalk to there is %p\n", mmuwalk(mach0->pdb, va, 1, 0))); ilock(&mmukmaplock); if((pte = mmuwalk(mach0->pdb, va, 1, 0)) == nil){ iunlock(&mmukmaplock); return 0; } if(!(*pte & PTESIZE) && mmuwalk(mach0->pdb, va, 2, 0) == nil){ iunlock(&mmukmaplock); return 0; } entry = *pte; if(!(m->pdb[PDX(va)] & PTEVALID)) m->pdb[PDX(va)] = entry; if(up && up->mmupdb){ ((ulong*)up->mmupdb->va)[PDX(va)] = entry; mmuflushtlb((ulong *)up->mmupdb->/*pa*/va); } else mmuflushtlb(/*PADDR*/(m->pdb)); iunlock(&mmukmaplock); return 1; } ulong mmukmap(ulong pa, ulong va, int size) { void __flush_page_update_queue(void); Mach *mach0; ulong ova, pae, *table, pgsz, *pte, x; int pse, sync; ulong vae; // panic("mmukmap"); mach0 = MACHP(0); #ifdef NOT if((mach0->cpuiddx & 0x08) && (getcr4() & 0x10)) pse = 1; else #endif pse = 0; sync = 0; pa = PPN(pa); if(va == 0) va = (ulong)KADDR(pa); else va = PPN(va); ova = va; /* for xen, the last 64 MB of virtual is disallowed. Just disallow * anything for now. */ pae = pa + size; vae = va + size; if (pa > TOM) { LOG(dp("pa 0x%ulx not allowed in XEN mode\n", pa)); return 0; } if (pae > TOM) { LOG(dp("pa end 0x%ulx not allowed in XEN mode\n", pae)); return 0; } if (va > TOM) { LOG(dp("va 0x%ulx not allowed in XEN mode\n", va)); return 0; } if (vae > TOM) { LOG(dp("vae 0x%ulx not allowed in XEN mode\n", vae)); return 0; } ilock(&mmukmaplock); while(pa < pae){ table = &mach0->pdb[PDX(va)]; /* * Possibly already mapped. */ if(*table & PTEVALID){ if(*table & PTESIZE){ panic("NO BIG PAGES"); /* * Big page. Does it fit within? * If it does, adjust pgsz so the correct end can be * returned and get out. * If not, adjust pgsz up to the next 4MB boundary * and continue. */ x = PPN(*table); if(x != pa) panic("mmukmap1: pa %luX entry %luX\n", pa, *table); x += 4*MB; if(pae <= x){ pa = pae; break; } pgsz = x - pa; pa += pgsz; va += pgsz; continue; } else{ /* * Little page. Walk to the entry. * If the entry is valid, set pgsz and continue. * If not, make it so, set pgsz, sync and continue. */ pte = mmuwalk(mach0->pdb, va, 2, 0); if(pte && *pte & PTEVALID){ x = PPN(*pte); if(x != pa) panic("mmukmap2: pa %luX entry %luX\n", pa, *pte); pgsz = BY2PG; pa += pgsz; va += pgsz; sync++; continue; } } } /* * Not mapped. Check if it can be mapped using a big page - * starts on a 4MB boundary, size >= 4MB and processor can do it. * If not a big page, walk the walk, talk the talk. * Sync is set. * * If we're creating a kernel mapping, we know that it will never * expire and thus we can set the PTEGLOBAL bit to make the entry * persist in the TLB across flushes. If we do add support later for * unmapping kernel addresses, see devarch.c for instructions on * how to do a full TLB flush. */ if(pse && (pa % (4*MB)) == 0 && (pae >= pa+4*MB)){ *table = pa|PTESIZE|PTEWRITE|PTEUNCACHED|PTEVALID; if((va&KZERO) && m->havepge) *table |= PTEGLOBAL; pgsz = 4*MB; } else{ ulong pteval; pte = mmuwalk(mach0->pdb, va, 2, 1); pteval = pa|PTEWRITE|PTEUNCACHED|PTEVALID; if((va&KZERO) && m->havepge) pteval |= PTEGLOBAL; queue_l1_entry_update(pte, pteval); pgsz = BY2PG; } pa += pgsz; va += pgsz; sync++; } iunlock(&mmukmaplock); if (sync) _flush_page_update_queue(); /* * If something was added * then need to sync up. */ if(sync) mmukmapsync(ova); return pa; }