| 1 | /****************************************************************************** |
|---|
| 2 | * arch/x86/x86_32/mm.c |
|---|
| 3 | * |
|---|
| 4 | * Modifications to Linux original are copyright (c) 2004, K A Fraser |
|---|
| 5 | * |
|---|
| 6 | * This program is free software; you can redistribute it and/or modify |
|---|
| 7 | * it under the terms of the GNU General Public License as published by |
|---|
| 8 | * the Free Software Foundation; either version 2 of the License, or |
|---|
| 9 | * (at your option) any later version. |
|---|
| 10 | * |
|---|
| 11 | * This program is distributed in the hope that it will be useful, |
|---|
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 14 | * GNU General Public License for more details. |
|---|
| 15 | * |
|---|
| 16 | * You should have received a copy of the GNU General Public License |
|---|
| 17 | * along with this program; if not, write to the Free Software |
|---|
| 18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|---|
| 19 | */ |
|---|
| 20 | |
|---|
| 21 | #include <xen/config.h> |
|---|
| 22 | #include <xen/lib.h> |
|---|
| 23 | #include <xen/init.h> |
|---|
| 24 | #include <xen/mm.h> |
|---|
| 25 | #include <xen/sched.h> |
|---|
| 26 | #include <xen/guest_access.h> |
|---|
| 27 | #include <asm/current.h> |
|---|
| 28 | #include <asm/page.h> |
|---|
| 29 | #include <asm/flushtlb.h> |
|---|
| 30 | #include <asm/fixmap.h> |
|---|
| 31 | #include <public/memory.h> |
|---|
| 32 | |
|---|
| 33 | unsigned int PAGE_HYPERVISOR = __PAGE_HYPERVISOR; |
|---|
| 34 | unsigned int PAGE_HYPERVISOR_NOCACHE = __PAGE_HYPERVISOR_NOCACHE; |
|---|
| 35 | |
|---|
| 36 | static unsigned long mpt_size; |
|---|
| 37 | |
|---|
| 38 | void *alloc_xen_pagetable(void) |
|---|
| 39 | { |
|---|
| 40 | extern int early_boot; |
|---|
| 41 | extern unsigned long xenheap_phys_start; |
|---|
| 42 | unsigned long mfn; |
|---|
| 43 | |
|---|
| 44 | if ( !early_boot ) |
|---|
| 45 | { |
|---|
| 46 | void *v = alloc_xenheap_page(); |
|---|
| 47 | BUG_ON(v == NULL); |
|---|
| 48 | return v; |
|---|
| 49 | } |
|---|
| 50 | |
|---|
| 51 | mfn = xenheap_phys_start >> PAGE_SHIFT; |
|---|
| 52 | xenheap_phys_start += PAGE_SIZE; |
|---|
| 53 | return mfn_to_virt(mfn); |
|---|
| 54 | } |
|---|
| 55 | |
|---|
/* Release a page-table page by handing it to free_xenheap_page(). */
void free_xen_pagetable(void *v)
{
    free_xenheap_page(v);
}
|---|
| 60 | |
|---|
| 61 | l2_pgentry_t *virt_to_xen_l2e(unsigned long v) |
|---|
| 62 | { |
|---|
| 63 | return &idle_pg_table_l2[l2_linear_offset(v)]; |
|---|
| 64 | } |
|---|
| 65 | |
|---|
| 66 | void __init paging_init(void) |
|---|
| 67 | { |
|---|
| 68 | void *ioremap_pt; |
|---|
| 69 | unsigned long v; |
|---|
| 70 | struct page_info *pg; |
|---|
| 71 | int i; |
|---|
| 72 | |
|---|
| 73 | #ifdef CONFIG_X86_PAE |
|---|
| 74 | printk("PAE enabled, limit: %d GB\n", MACHPHYS_MBYTES); |
|---|
| 75 | #else |
|---|
| 76 | printk("PAE disabled.\n"); |
|---|
| 77 | #endif |
|---|
| 78 | |
|---|
| 79 | if ( cpu_has_pge ) |
|---|
| 80 | { |
|---|
| 81 | /* Suitable Xen mapping can be GLOBAL. */ |
|---|
| 82 | set_in_cr4(X86_CR4_PGE); |
|---|
| 83 | PAGE_HYPERVISOR |= _PAGE_GLOBAL; |
|---|
| 84 | PAGE_HYPERVISOR_NOCACHE |= _PAGE_GLOBAL; |
|---|
| 85 | /* Transform early mappings (e.g., the frametable). */ |
|---|
| 86 | for ( v = HYPERVISOR_VIRT_START; v; v += (1 << L2_PAGETABLE_SHIFT) ) |
|---|
| 87 | if ( (l2e_get_flags(idle_pg_table_l2[l2_linear_offset(v)]) & |
|---|
| 88 | (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT) ) |
|---|
| 89 | l2e_add_flags(idle_pg_table_l2[l2_linear_offset(v)], |
|---|
| 90 | _PAGE_GLOBAL); |
|---|
| 91 | } |
|---|
| 92 | |
|---|
| 93 | /* |
|---|
| 94 | * Allocate and map the machine-to-phys table and create read-only mapping |
|---|
| 95 | * of MPT for guest-OS use. |
|---|
| 96 | */ |
|---|
| 97 | mpt_size = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1; |
|---|
| 98 | mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL); |
|---|
| 99 | for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ ) |
|---|
| 100 | { |
|---|
| 101 | if ( (pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL ) |
|---|
| 102 | panic("Not enough memory to bootstrap Xen.\n"); |
|---|
| 103 | l2e_write(&idle_pg_table_l2[l2_linear_offset(RDWR_MPT_VIRT_START) + i], |
|---|
| 104 | l2e_from_page(pg, PAGE_HYPERVISOR | _PAGE_PSE)); |
|---|
| 105 | /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */ |
|---|
| 106 | l2e_write(&idle_pg_table_l2[l2_linear_offset(RO_MPT_VIRT_START) + i], |
|---|
| 107 | l2e_from_page( |
|---|
| 108 | pg, (__PAGE_HYPERVISOR | _PAGE_PSE) & ~_PAGE_RW)); |
|---|
| 109 | } |
|---|
| 110 | |
|---|
| 111 | /* Fill with an obvious debug pattern. */ |
|---|
| 112 | for ( i = 0; i < (mpt_size / BYTES_PER_LONG); i++) |
|---|
| 113 | set_gpfn_from_mfn(i, 0x55555555); |
|---|
| 114 | |
|---|
| 115 | /* Create page tables for ioremap(). */ |
|---|
| 116 | for ( i = 0; i < (IOREMAP_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ ) |
|---|
| 117 | { |
|---|
| 118 | ioremap_pt = alloc_xenheap_page(); |
|---|
| 119 | clear_page(ioremap_pt); |
|---|
| 120 | l2e_write(&idle_pg_table_l2[l2_linear_offset(IOREMAP_VIRT_START) + i], |
|---|
| 121 | l2e_from_page(virt_to_page(ioremap_pt), __PAGE_HYPERVISOR)); |
|---|
| 122 | } |
|---|
| 123 | } |
|---|
| 124 | |
|---|
| 125 | void __init setup_idle_pagetable(void) |
|---|
| 126 | { |
|---|
| 127 | int i; |
|---|
| 128 | |
|---|
| 129 | for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) |
|---|
| 130 | l2e_write(&idle_pg_table_l2[l2_linear_offset(PERDOMAIN_VIRT_START)+i], |
|---|
| 131 | l2e_from_page(virt_to_page(idle_vcpu[0]->domain-> |
|---|
| 132 | arch.mm_perdomain_pt) + i, |
|---|
| 133 | __PAGE_HYPERVISOR)); |
|---|
| 134 | } |
|---|
| 135 | |
|---|
| 136 | void __init zap_low_mappings(l2_pgentry_t *base) |
|---|
| 137 | { |
|---|
| 138 | int i; |
|---|
| 139 | u32 addr; |
|---|
| 140 | |
|---|
| 141 | for ( i = 0; ; i++ ) |
|---|
| 142 | { |
|---|
| 143 | addr = i << L2_PAGETABLE_SHIFT; |
|---|
| 144 | if ( addr >= HYPERVISOR_VIRT_START ) |
|---|
| 145 | break; |
|---|
| 146 | if ( l2e_get_paddr(base[i]) != addr ) |
|---|
| 147 | continue; |
|---|
| 148 | l2e_write(&base[i], l2e_empty()); |
|---|
| 149 | } |
|---|
| 150 | |
|---|
| 151 | flush_tlb_all_pge(); |
|---|
| 152 | } |
|---|
| 153 | |
|---|
| 154 | void subarch_init_memory(void) |
|---|
| 155 | { |
|---|
| 156 | unsigned long m2p_start_mfn; |
|---|
| 157 | unsigned int i, j; |
|---|
| 158 | |
|---|
| 159 | /* |
|---|
| 160 | * We are rather picky about the layout of 'struct page_info'. The |
|---|
| 161 | * count_info and domain fields must be adjacent, as we perform atomic |
|---|
| 162 | * 64-bit operations on them. Also, just for sanity, we assert the size |
|---|
| 163 | * of the structure here. |
|---|
| 164 | */ |
|---|
| 165 | BUILD_BUG_ON(offsetof(struct page_info, u.inuse._domain) != |
|---|
| 166 | (offsetof(struct page_info, count_info) + sizeof(u32))); |
|---|
| 167 | BUILD_BUG_ON((offsetof(struct page_info, count_info) & 7) != 0); |
|---|
| 168 | BUILD_BUG_ON(sizeof(struct page_info) != 24); |
|---|
| 169 | |
|---|
| 170 | /* M2P table is mappable read-only by privileged domains. */ |
|---|
| 171 | for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ ) |
|---|
| 172 | { |
|---|
| 173 | m2p_start_mfn = l2e_get_pfn( |
|---|
| 174 | idle_pg_table_l2[l2_linear_offset(RDWR_MPT_VIRT_START) + i]); |
|---|
| 175 | for ( j = 0; j < L2_PAGETABLE_ENTRIES; j++ ) |
|---|
| 176 | { |
|---|
| 177 | struct page_info *page = mfn_to_page(m2p_start_mfn + j); |
|---|
| 178 | share_xen_page_with_privileged_guests(page, XENSHARE_readonly); |
|---|
| 179 | } |
|---|
| 180 | } |
|---|
| 181 | |
|---|
| 182 | if ( supervisor_mode_kernel ) |
|---|
| 183 | { |
|---|
| 184 | /* Guest kernel runs in ring 0, not ring 1. */ |
|---|
| 185 | struct desc_struct *d; |
|---|
| 186 | d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY]; |
|---|
| 187 | d[0].b &= ~_SEGMENT_DPL; |
|---|
| 188 | d[1].b &= ~_SEGMENT_DPL; |
|---|
| 189 | } |
|---|
| 190 | } |
|---|
| 191 | |
|---|
| 192 | long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg) |
|---|
| 193 | { |
|---|
| 194 | struct xen_machphys_mfn_list xmml; |
|---|
| 195 | unsigned long mfn; |
|---|
| 196 | unsigned int i, max; |
|---|
| 197 | long rc = 0; |
|---|
| 198 | |
|---|
| 199 | switch ( op ) |
|---|
| 200 | { |
|---|
| 201 | case XENMEM_machphys_mfn_list: |
|---|
| 202 | if ( copy_from_guest(&xmml, arg, 1) ) |
|---|
| 203 | return -EFAULT; |
|---|
| 204 | |
|---|
| 205 | max = min_t(unsigned int, xmml.max_extents, mpt_size >> 21); |
|---|
| 206 | |
|---|
| 207 | for ( i = 0; i < max; i++ ) |
|---|
| 208 | { |
|---|
| 209 | mfn = l2e_get_pfn(idle_pg_table_l2[l2_linear_offset( |
|---|
| 210 | RDWR_MPT_VIRT_START + (i << 21))]) + l1_table_offset(i << 21); |
|---|
| 211 | if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) ) |
|---|
| 212 | return -EFAULT; |
|---|
| 213 | } |
|---|
| 214 | |
|---|
| 215 | xmml.nr_extents = i; |
|---|
| 216 | if ( copy_to_guest(arg, &xmml, 1) ) |
|---|
| 217 | return -EFAULT; |
|---|
| 218 | |
|---|
| 219 | break; |
|---|
| 220 | |
|---|
| 221 | default: |
|---|
| 222 | rc = -ENOSYS; |
|---|
| 223 | break; |
|---|
| 224 | } |
|---|
| 225 | |
|---|
| 226 | return rc; |
|---|
| 227 | } |
|---|
| 228 | |
|---|
| 229 | long do_stack_switch(unsigned long ss, unsigned long esp) |
|---|
| 230 | { |
|---|
| 231 | int nr = smp_processor_id(); |
|---|
| 232 | struct tss_struct *t = &init_tss[nr]; |
|---|
| 233 | |
|---|
| 234 | fixup_guest_stack_selector(current->domain, ss); |
|---|
| 235 | |
|---|
| 236 | current->arch.guest_context.kernel_ss = ss; |
|---|
| 237 | current->arch.guest_context.kernel_sp = esp; |
|---|
| 238 | t->ss1 = ss; |
|---|
| 239 | t->esp1 = esp; |
|---|
| 240 | |
|---|
| 241 | return 0; |
|---|
| 242 | } |
|---|
| 243 | |
|---|
| 244 | /* Returns TRUE if given descriptor is valid for GDT or LDT. */ |
|---|
| 245 | int check_descriptor(const struct domain *dom, struct desc_struct *d) |
|---|
| 246 | { |
|---|
| 247 | unsigned long base, limit; |
|---|
| 248 | u32 a = d->a, b = d->b; |
|---|
| 249 | u16 cs; |
|---|
| 250 | |
|---|
| 251 | /* Let a ring0 guest kernel set any descriptor it wants to. */ |
|---|
| 252 | if ( supervisor_mode_kernel ) |
|---|
| 253 | return 1; |
|---|
| 254 | |
|---|
| 255 | /* A not-present descriptor will always fault, so is safe. */ |
|---|
| 256 | if ( !(b & _SEGMENT_P) ) |
|---|
| 257 | goto good; |
|---|
| 258 | |
|---|
| 259 | /* |
|---|
| 260 | * We don't allow a DPL of zero. There is no legitimate reason for |
|---|
| 261 | * specifying DPL==0, and it gets rather dangerous if we also accept call |
|---|
| 262 | * gates (consider a call gate pointing at another kernel descriptor with |
|---|
| 263 | * DPL 0 -- this would get the OS ring-0 privileges). |
|---|
| 264 | */ |
|---|
| 265 | if ( (b & _SEGMENT_DPL) < (GUEST_KERNEL_RPL(dom) << 13) ) |
|---|
| 266 | d->b = b = (b & ~_SEGMENT_DPL) | (GUEST_KERNEL_RPL(dom) << 13); |
|---|
| 267 | |
|---|
| 268 | if ( !(b & _SEGMENT_S) ) |
|---|
| 269 | { |
|---|
| 270 | /* |
|---|
| 271 | * System segment: |
|---|
| 272 | * 1. Don't allow interrupt or trap gates as they belong in the IDT. |
|---|
| 273 | * 2. Don't allow TSS descriptors or task gates as we don't |
|---|
| 274 | * virtualise x86 tasks. |
|---|
| 275 | * 3. Don't allow LDT descriptors because they're unnecessary and |
|---|
| 276 | * I'm uneasy about allowing an LDT page to contain LDT |
|---|
| 277 | * descriptors. In any case, Xen automatically creates the |
|---|
| 278 | * required descriptor when reloading the LDT register. |
|---|
| 279 | * 4. We allow call gates but they must not jump to a private segment. |
|---|
| 280 | */ |
|---|
| 281 | |
|---|
| 282 | /* Disallow everything but call gates. */ |
|---|
| 283 | if ( (b & _SEGMENT_TYPE) != 0xc00 ) |
|---|
| 284 | goto bad; |
|---|
| 285 | |
|---|
| 286 | /* Validate and fix up the target code selector. */ |
|---|
| 287 | cs = a >> 16; |
|---|
| 288 | fixup_guest_code_selector(dom, cs); |
|---|
| 289 | if ( !guest_gate_selector_okay(dom, cs) ) |
|---|
| 290 | goto bad; |
|---|
| 291 | a = d->a = (d->a & 0xffffU) | (cs << 16); |
|---|
| 292 | |
|---|
| 293 | /* Reserved bits must be zero. */ |
|---|
| 294 | if ( (b & 0xe0) != 0 ) |
|---|
| 295 | goto bad; |
|---|
| 296 | |
|---|
| 297 | /* No base/limit check is needed for a call gate. */ |
|---|
| 298 | goto good; |
|---|
| 299 | } |
|---|
| 300 | |
|---|
| 301 | /* Check that base is at least a page away from Xen-private area. */ |
|---|
| 302 | base = (b&(0xff<<24)) | ((b&0xff)<<16) | (a>>16); |
|---|
| 303 | if ( base >= (GUEST_SEGMENT_MAX_ADDR - PAGE_SIZE) ) |
|---|
| 304 | goto bad; |
|---|
| 305 | |
|---|
| 306 | /* Check and truncate the limit if necessary. */ |
|---|
| 307 | limit = (b&0xf0000) | (a&0xffff); |
|---|
| 308 | limit++; /* We add one because limit is inclusive. */ |
|---|
| 309 | if ( (b & _SEGMENT_G) ) |
|---|
| 310 | limit <<= 12; |
|---|
| 311 | |
|---|
| 312 | if ( (b & (_SEGMENT_CODE | _SEGMENT_EC)) == _SEGMENT_EC ) |
|---|
| 313 | { |
|---|
| 314 | /* |
|---|
| 315 | * DATA, GROWS-DOWN. |
|---|
| 316 | * Grows-down limit check. |
|---|
| 317 | * NB. limit == 0xFFFFF provides no access (if G=1). |
|---|
| 318 | * limit == 0x00000 provides 4GB-4kB access (if G=1). |
|---|
| 319 | */ |
|---|
| 320 | if ( (base + limit) > base ) |
|---|
| 321 | { |
|---|
| 322 | limit = -(base & PAGE_MASK); |
|---|
| 323 | goto truncate; |
|---|
| 324 | } |
|---|
| 325 | } |
|---|
| 326 | else |
|---|
| 327 | { |
|---|
| 328 | /* |
|---|
| 329 | * DATA, GROWS-UP. |
|---|
| 330 | * CODE (CONFORMING AND NON-CONFORMING). |
|---|
| 331 | * Grows-up limit check. |
|---|
| 332 | * NB. limit == 0xFFFFF provides 4GB access (if G=1). |
|---|
| 333 | * limit == 0x00000 provides 4kB access (if G=1). |
|---|
| 334 | */ |
|---|
| 335 | if ( ((base + limit) <= base) || |
|---|
| 336 | ((base + limit) > GUEST_SEGMENT_MAX_ADDR) ) |
|---|
| 337 | { |
|---|
| 338 | limit = GUEST_SEGMENT_MAX_ADDR - base; |
|---|
| 339 | truncate: |
|---|
| 340 | if ( !(b & _SEGMENT_G) ) |
|---|
| 341 | goto bad; /* too dangerous; too hard to work out... */ |
|---|
| 342 | limit = (limit >> 12) - 1; |
|---|
| 343 | d->a &= ~0x0ffff; d->a |= limit & 0x0ffff; |
|---|
| 344 | d->b &= ~0xf0000; d->b |= limit & 0xf0000; |
|---|
| 345 | } |
|---|
| 346 | } |
|---|
| 347 | |
|---|
| 348 | good: |
|---|
| 349 | return 1; |
|---|
| 350 | bad: |
|---|
| 351 | return 0; |
|---|
| 352 | } |
|---|
| 353 | |
|---|
| 354 | /* |
|---|
| 355 | * Local variables: |
|---|
| 356 | * mode: C |
|---|
| 357 | * c-set-style: "BSD" |
|---|
| 358 | * c-basic-offset: 4 |
|---|
| 359 | * tab-width: 4 |
|---|
| 360 | * indent-tabs-mode: nil |
|---|
| 361 | * End: |
|---|
| 362 | */ |
|---|