| [34] | 1 | /****************************************************************************** |
|---|
| 2 | * xc_hvm_build.c |
|---|
| 3 | */ |
|---|
| 4 | |
|---|
| 5 | #include <stddef.h> |
|---|
| 6 | #include <inttypes.h> |
|---|
| 7 | #include <stdlib.h> |
|---|
| 8 | #include <unistd.h> |
|---|
| 9 | #include <zlib.h> |
|---|
| 10 | |
|---|
| 11 | #include "xg_private.h" |
|---|
| 12 | #include "xc_private.h" |
|---|
| 13 | |
|---|
| 14 | #include <xen/foreign/x86_32.h> |
|---|
| 15 | #include <xen/foreign/x86_64.h> |
|---|
| 16 | #include <xen/hvm/hvm_info_table.h> |
|---|
| 17 | #include <xen/hvm/params.h> |
|---|
| 18 | #include <xen/hvm/e820.h> |
|---|
| 19 | |
|---|
| 20 | #include <xen/libelf.h> |
|---|
| 21 | |
|---|
/*
 * Guest frame number at which setup_guest() asks Xen to place the
 * shared_info page (XENMAPSPACE_shared_info) so that the page can be
 * mapped and initialised from the tools.
 */
#define SCRATCH_PFN 0xFFFFF
|---|
| 23 | |
|---|
/*
 * Need to provide the right flavour of vcpu context for Xen.
 *
 * The same storage is viewed as a 64-bit context (c64), a 32-bit context
 * (c32) or the tools' native context (c).  setup_guest() fills in c64 or
 * c32 depending on whether the Xen capabilities string contains "x86_64";
 * the native view 'c' is what is handed to XEN_DOMCTL_setvcpucontext.
 */
typedef union
{
    vcpu_guest_context_x86_64_t c64;
    vcpu_guest_context_x86_32_t c32;
    vcpu_guest_context_t c;
} vcpu_guest_context_either_t;
|---|
| 31 | |
|---|
| 32 | static void build_e820map(void *e820_page, unsigned long long mem_size) |
|---|
| 33 | { |
|---|
| 34 | struct e820entry *e820entry = |
|---|
| 35 | (struct e820entry *)(((unsigned char *)e820_page) + E820_MAP_OFFSET); |
|---|
| 36 | unsigned long long extra_mem_size = 0; |
|---|
| 37 | unsigned char nr_map = 0; |
|---|
| 38 | |
|---|
| 39 | /* |
|---|
| 40 | * Physical address space from HVM_BELOW_4G_RAM_END to 4G is reserved |
|---|
| 41 | * for PCI devices MMIO. So if HVM has more than HVM_BELOW_4G_RAM_END |
|---|
| 42 | * RAM, memory beyond HVM_BELOW_4G_RAM_END will go to 4G above. |
|---|
| 43 | */ |
|---|
| 44 | if ( mem_size > HVM_BELOW_4G_RAM_END ) |
|---|
| 45 | { |
|---|
| 46 | extra_mem_size = mem_size - HVM_BELOW_4G_RAM_END; |
|---|
| 47 | mem_size = HVM_BELOW_4G_RAM_END; |
|---|
| 48 | } |
|---|
| 49 | |
|---|
| 50 | /* 0x0-0x9FC00: Ordinary RAM. */ |
|---|
| 51 | e820entry[nr_map].addr = 0x0; |
|---|
| 52 | e820entry[nr_map].size = 0x9FC00; |
|---|
| 53 | e820entry[nr_map].type = E820_RAM; |
|---|
| 54 | nr_map++; |
|---|
| 55 | |
|---|
| 56 | /* 0x9FC00-0xA0000: Extended BIOS Data Area (EBDA). */ |
|---|
| 57 | e820entry[nr_map].addr = 0x9FC00; |
|---|
| 58 | e820entry[nr_map].size = 0x400; |
|---|
| 59 | e820entry[nr_map].type = E820_RESERVED; |
|---|
| 60 | nr_map++; |
|---|
| 61 | |
|---|
| 62 | /* |
|---|
| 63 | * Following regions are standard regions of the PC memory map. |
|---|
| 64 | * They are not covered by e820 regions. OSes will not use as RAM. |
|---|
| 65 | * 0xA0000-0xC0000: VGA memory-mapped I/O. Not covered by E820. |
|---|
| 66 | * 0xC0000-0xE0000: 16-bit devices, expansion ROMs (inc. vgabios). |
|---|
| 67 | * TODO: hvmloader should free pages which turn out to be unused. |
|---|
| 68 | */ |
|---|
| 69 | |
|---|
| 70 | /* |
|---|
| 71 | * 0xE0000-0x0F0000: PC-specific area. We place ACPI tables here. |
|---|
| 72 | * We *cannot* mark as E820_ACPI, for two reasons: |
|---|
| 73 | * 1. ACPI spec. says that E820_ACPI regions below |
|---|
| 74 | * 16MB must clip INT15h 0x88 and 0xe801 queries. |
|---|
| 75 | * Our rombios doesn't do this. |
|---|
| 76 | * 2. The OS is allowed to reclaim ACPI memory after |
|---|
| 77 | * parsing the tables. But our FACS is in this |
|---|
| 78 | * region and it must not be reclaimed (it contains |
|---|
| 79 | * the ACPI global lock!). |
|---|
| 80 | * 0xF0000-0x100000: System BIOS. |
|---|
| 81 | * TODO: hvmloader should free pages which turn out to be unused. |
|---|
| 82 | */ |
|---|
| 83 | e820entry[nr_map].addr = 0xE0000; |
|---|
| 84 | e820entry[nr_map].size = 0x20000; |
|---|
| 85 | e820entry[nr_map].type = E820_RESERVED; |
|---|
| 86 | nr_map++; |
|---|
| 87 | |
|---|
| 88 | /* Low RAM goes here. Remove 3 pages for ioreq, bufioreq, and xenstore. */ |
|---|
| 89 | e820entry[nr_map].addr = 0x100000; |
|---|
| 90 | e820entry[nr_map].size = mem_size - 0x100000 - PAGE_SIZE * 3; |
|---|
| 91 | e820entry[nr_map].type = E820_RAM; |
|---|
| 92 | nr_map++; |
|---|
| 93 | |
|---|
| 94 | /* Explicitly reserve space for special pages (ioreq and xenstore). */ |
|---|
| 95 | e820entry[nr_map].addr = mem_size - PAGE_SIZE * 3; |
|---|
| 96 | e820entry[nr_map].size = PAGE_SIZE * 3; |
|---|
| 97 | e820entry[nr_map].type = E820_RESERVED; |
|---|
| 98 | nr_map++; |
|---|
| 99 | |
|---|
| 100 | if ( extra_mem_size ) |
|---|
| 101 | { |
|---|
| 102 | e820entry[nr_map].addr = (1ULL << 32); |
|---|
| 103 | e820entry[nr_map].size = extra_mem_size; |
|---|
| 104 | e820entry[nr_map].type = E820_RAM; |
|---|
| 105 | nr_map++; |
|---|
| 106 | } |
|---|
| 107 | |
|---|
| 108 | *(((unsigned char *)e820_page) + E820_MAP_NR_OFFSET) = nr_map; |
|---|
| 109 | } |
|---|
| 110 | |
|---|
| 111 | static int loadelfimage( |
|---|
| 112 | struct elf_binary *elf, int xch, uint32_t dom, unsigned long *parray) |
|---|
| 113 | { |
|---|
| 114 | privcmd_mmap_entry_t *entries = NULL; |
|---|
| 115 | int pages = (elf->pend - elf->pstart + PAGE_SIZE - 1) >> PAGE_SHIFT; |
|---|
| 116 | int i, rc = -1; |
|---|
| 117 | |
|---|
| 118 | /* Map address space for initial elf image. */ |
|---|
| 119 | entries = malloc(pages * sizeof(privcmd_mmap_entry_t)); |
|---|
| 120 | if ( entries == NULL ) |
|---|
| 121 | goto err; |
|---|
| 122 | elf->dest = mmap(NULL, pages << PAGE_SHIFT, PROT_READ | PROT_WRITE, |
|---|
| 123 | MAP_SHARED, xch, 0); |
|---|
| 124 | if ( elf->dest == MAP_FAILED ) |
|---|
| 125 | goto err; |
|---|
| 126 | |
|---|
| 127 | for ( i = 0; i < pages; i++ ) |
|---|
| 128 | { |
|---|
| 129 | entries[i].va = (uintptr_t)elf->dest + (i << PAGE_SHIFT); |
|---|
| 130 | entries[i].mfn = parray[(elf->pstart >> PAGE_SHIFT) + i]; |
|---|
| 131 | entries[i].npages = 1; |
|---|
| 132 | } |
|---|
| 133 | |
|---|
| 134 | rc = xc_map_foreign_ranges(xch, dom, entries, pages); |
|---|
| 135 | if ( rc < 0 ) |
|---|
| 136 | goto err; |
|---|
| 137 | |
|---|
| 138 | /* Load the initial elf image. */ |
|---|
| 139 | elf_load_binary(elf); |
|---|
| 140 | rc = 0; |
|---|
| 141 | |
|---|
| 142 | err: |
|---|
| 143 | if ( elf->dest ) |
|---|
| 144 | { |
|---|
| 145 | munmap(elf->dest, pages << PAGE_SHIFT); |
|---|
| 146 | elf->dest = NULL; |
|---|
| 147 | } |
|---|
| 148 | |
|---|
| 149 | if ( entries ) |
|---|
| 150 | free(entries); |
|---|
| 151 | |
|---|
| 152 | return rc; |
|---|
| 153 | } |
|---|
| 154 | |
|---|
| 155 | static int setup_guest(int xc_handle, |
|---|
| 156 | uint32_t dom, int memsize, |
|---|
| 157 | char *image, unsigned long image_size, |
|---|
| 158 | vcpu_guest_context_either_t *ctxt) |
|---|
| 159 | { |
|---|
| 160 | xen_pfn_t *page_array = NULL; |
|---|
| 161 | unsigned long i, nr_pages = (unsigned long)memsize << (20 - PAGE_SHIFT); |
|---|
| 162 | unsigned long shared_page_nr; |
|---|
| 163 | struct xen_add_to_physmap xatp; |
|---|
| 164 | struct shared_info *shared_info; |
|---|
| 165 | void *e820_page; |
|---|
| 166 | struct elf_binary elf; |
|---|
| 167 | uint64_t v_start, v_end; |
|---|
| 168 | int rc; |
|---|
| 169 | xen_capabilities_info_t caps; |
|---|
| 170 | |
|---|
| 171 | /* An HVM guest must be initialised with at least 2MB memory. */ |
|---|
| 172 | if ( memsize < 2 ) |
|---|
| 173 | goto error_out; |
|---|
| 174 | |
|---|
| 175 | if ( elf_init(&elf, image, image_size) != 0 ) |
|---|
| 176 | goto error_out; |
|---|
| 177 | elf_parse_binary(&elf); |
|---|
| 178 | v_start = 0; |
|---|
| 179 | v_end = (unsigned long long)memsize << 20; |
|---|
| 180 | |
|---|
| 181 | if ( xc_version(xc_handle, XENVER_capabilities, &caps) != 0 ) |
|---|
| 182 | { |
|---|
| 183 | PERROR("Could not get Xen capabilities\n"); |
|---|
| 184 | goto error_out; |
|---|
| 185 | } |
|---|
| 186 | |
|---|
| 187 | if ( (elf.pstart & (PAGE_SIZE - 1)) != 0 ) |
|---|
| 188 | { |
|---|
| 189 | PERROR("Guest OS must load to a page boundary.\n"); |
|---|
| 190 | goto error_out; |
|---|
| 191 | } |
|---|
| 192 | |
|---|
| 193 | IPRINTF("VIRTUAL MEMORY ARRANGEMENT:\n" |
|---|
| 194 | " Loader: %016"PRIx64"->%016"PRIx64"\n" |
|---|
| 195 | " TOTAL: %016"PRIx64"->%016"PRIx64"\n" |
|---|
| 196 | " ENTRY ADDRESS: %016"PRIx64"\n", |
|---|
| 197 | elf.pstart, elf.pend, |
|---|
| 198 | v_start, v_end, |
|---|
| 199 | elf_uval(&elf, elf.ehdr, e_entry)); |
|---|
| 200 | |
|---|
| 201 | if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL ) |
|---|
| 202 | { |
|---|
| 203 | PERROR("Could not allocate memory.\n"); |
|---|
| 204 | goto error_out; |
|---|
| 205 | } |
|---|
| 206 | |
|---|
| 207 | for ( i = 0; i < nr_pages; i++ ) |
|---|
| 208 | page_array[i] = i; |
|---|
| 209 | for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < nr_pages; i++ ) |
|---|
| 210 | page_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT; |
|---|
| 211 | |
|---|
| 212 | /* Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000. */ |
|---|
| 213 | rc = xc_domain_memory_populate_physmap( |
|---|
| 214 | xc_handle, dom, 0xa0, 0, 0, &page_array[0x00]); |
|---|
| 215 | if ( rc == 0 ) |
|---|
| 216 | rc = xc_domain_memory_populate_physmap( |
|---|
| 217 | xc_handle, dom, nr_pages - 0xc0, 0, 0, &page_array[0xc0]); |
|---|
| 218 | if ( rc != 0 ) |
|---|
| 219 | { |
|---|
| 220 | PERROR("Could not allocate memory for HVM guest.\n"); |
|---|
| 221 | goto error_out; |
|---|
| 222 | } |
|---|
| 223 | |
|---|
| 224 | if ( loadelfimage(&elf, xc_handle, dom, page_array) != 0 ) |
|---|
| 225 | goto error_out; |
|---|
| 226 | |
|---|
| 227 | if ( (e820_page = xc_map_foreign_range( |
|---|
| 228 | xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, |
|---|
| 229 | E820_MAP_PAGE >> PAGE_SHIFT)) == NULL ) |
|---|
| 230 | goto error_out; |
|---|
| 231 | memset(e820_page, 0, PAGE_SIZE); |
|---|
| 232 | build_e820map(e820_page, v_end); |
|---|
| 233 | munmap(e820_page, PAGE_SIZE); |
|---|
| 234 | |
|---|
| 235 | /* Map and initialise shared_info page. */ |
|---|
| 236 | xatp.domid = dom; |
|---|
| 237 | xatp.space = XENMAPSPACE_shared_info; |
|---|
| 238 | xatp.idx = 0; |
|---|
| 239 | xatp.gpfn = SCRATCH_PFN; |
|---|
| 240 | if ( (xc_memory_op(xc_handle, XENMEM_add_to_physmap, &xatp) != 0) || |
|---|
| 241 | ((shared_info = xc_map_foreign_range( |
|---|
| 242 | xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, |
|---|
| 243 | SCRATCH_PFN)) == NULL) ) |
|---|
| 244 | goto error_out; |
|---|
| 245 | memset(shared_info, 0, PAGE_SIZE); |
|---|
| 246 | /* NB. evtchn_upcall_mask is unused: leave as zero. */ |
|---|
| 247 | memset(&shared_info->evtchn_mask[0], 0xff, |
|---|
| 248 | sizeof(shared_info->evtchn_mask)); |
|---|
| 249 | munmap(shared_info, PAGE_SIZE); |
|---|
| 250 | |
|---|
| 251 | if ( v_end > HVM_BELOW_4G_RAM_END ) |
|---|
| 252 | shared_page_nr = (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT) - 1; |
|---|
| 253 | else |
|---|
| 254 | shared_page_nr = (v_end >> PAGE_SHIFT) - 1; |
|---|
| 255 | |
|---|
| 256 | /* Paranoia: clean pages. */ |
|---|
| 257 | if ( xc_clear_domain_page(xc_handle, dom, shared_page_nr) || |
|---|
| 258 | xc_clear_domain_page(xc_handle, dom, shared_page_nr-1) || |
|---|
| 259 | xc_clear_domain_page(xc_handle, dom, shared_page_nr-2) ) |
|---|
| 260 | goto error_out; |
|---|
| 261 | |
|---|
| 262 | xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, shared_page_nr-1); |
|---|
| 263 | xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, shared_page_nr-2); |
|---|
| 264 | xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, shared_page_nr); |
|---|
| 265 | |
|---|
| 266 | free(page_array); |
|---|
| 267 | |
|---|
| 268 | /* Set [er]ip in the way that's right for Xen */ |
|---|
| 269 | if ( strstr(caps, "x86_64") ) |
|---|
| 270 | { |
|---|
| 271 | ctxt->c64.user_regs.rip = elf_uval(&elf, elf.ehdr, e_entry); |
|---|
| 272 | ctxt->c64.flags = VGCF_online; |
|---|
| 273 | } |
|---|
| 274 | else |
|---|
| 275 | { |
|---|
| 276 | ctxt->c32.user_regs.eip = elf_uval(&elf, elf.ehdr, e_entry); |
|---|
| 277 | ctxt->c32.flags = VGCF_online; |
|---|
| 278 | } |
|---|
| 279 | |
|---|
| 280 | return 0; |
|---|
| 281 | |
|---|
| 282 | error_out: |
|---|
| 283 | free(page_array); |
|---|
| 284 | return -1; |
|---|
| 285 | } |
|---|
| 286 | |
|---|
| 287 | static int xc_hvm_build_internal(int xc_handle, |
|---|
| 288 | uint32_t domid, |
|---|
| 289 | int memsize, |
|---|
| 290 | char *image, |
|---|
| 291 | unsigned long image_size) |
|---|
| 292 | { |
|---|
| 293 | struct xen_domctl launch_domctl; |
|---|
| 294 | vcpu_guest_context_either_t ctxt; |
|---|
| 295 | int rc; |
|---|
| 296 | |
|---|
| 297 | if ( (image == NULL) || (image_size == 0) ) |
|---|
| 298 | { |
|---|
| 299 | ERROR("Image required"); |
|---|
| 300 | goto error_out; |
|---|
| 301 | } |
|---|
| 302 | |
|---|
| 303 | memset(&ctxt, 0, sizeof(ctxt)); |
|---|
| 304 | |
|---|
| 305 | if ( setup_guest(xc_handle, domid, memsize, image, image_size, &ctxt) < 0 ) |
|---|
| 306 | { |
|---|
| 307 | goto error_out; |
|---|
| 308 | } |
|---|
| 309 | |
|---|
| 310 | if ( lock_pages(&ctxt, sizeof(ctxt) ) ) |
|---|
| 311 | { |
|---|
| 312 | PERROR("%s: ctxt mlock failed", __func__); |
|---|
| 313 | goto error_out; |
|---|
| 314 | } |
|---|
| 315 | |
|---|
| 316 | memset(&launch_domctl, 0, sizeof(launch_domctl)); |
|---|
| 317 | launch_domctl.domain = (domid_t)domid; |
|---|
| 318 | launch_domctl.u.vcpucontext.vcpu = 0; |
|---|
| 319 | set_xen_guest_handle(launch_domctl.u.vcpucontext.ctxt, &ctxt.c); |
|---|
| 320 | launch_domctl.cmd = XEN_DOMCTL_setvcpucontext; |
|---|
| 321 | rc = xc_domctl(xc_handle, &launch_domctl); |
|---|
| 322 | |
|---|
| 323 | unlock_pages(&ctxt, sizeof(ctxt)); |
|---|
| 324 | |
|---|
| 325 | return rc; |
|---|
| 326 | |
|---|
| 327 | error_out: |
|---|
| 328 | return -1; |
|---|
| 329 | } |
|---|
| 330 | |
|---|
/*
 * is_loadable_phdr:
 *  Returns 1 if the program header is a PT_LOAD segment with the write
 *  or execute flag set, 0 otherwise.
 */
static inline int is_loadable_phdr(Elf32_Phdr *phdr)
{
    if ( phdr->p_type != PT_LOAD )
        return 0;

    return (phdr->p_flags & (PF_W|PF_X)) ? 1 : 0;
}
|---|
| 336 | |
|---|
/* xc_hvm_build:
 * Create a domain for a virtualized Linux, using files/filenames.
 *
 * The image named by image_name is read with xc_read_image(), handed to
 * xc_hvm_build_internal() and released again afterwards.
 *
 * Returns the result of xc_hvm_build_internal(), or -1 if image_name is
 * NULL or the image cannot be read.
 */
int xc_hvm_build(int xc_handle,
                 uint32_t domid,
                 int memsize,
                 const char *image_name)
{
    unsigned long image_size;
    char *image;
    int sts;

    if ( image_name == NULL )
        return -1;

    image = xc_read_image(image_name, &image_size);
    if ( image == NULL )
        return -1;

    sts = xc_hvm_build_internal(xc_handle, domid, memsize, image, image_size);

    free(image);

    return sts;
}
|---|
| 359 | |
|---|
/* xc_hvm_build_mem:
 * Create a domain for a virtualized Linux, using memory buffers.
 *
 * image_buffer/image_size describe the (possibly compressed) guest image;
 * it is passed through xc_inflate_buffer() before being handed to
 * xc_hvm_build_internal().  The caller keeps ownership of image_buffer.
 *
 * Returns the result of xc_hvm_build_internal(), or -1 if no buffer was
 * supplied or it could not be inflated.
 */
int xc_hvm_build_mem(int xc_handle,
                     uint32_t domid,
                     int memsize,
                     const char *image_buffer,
                     unsigned long image_size)
{
    int sts;
    unsigned long img_len;
    char *img;

    /* Validate that there is a kernel buffer */

    if ( (image_buffer == NULL) || (image_size == 0) )
    {
        ERROR("kernel image buffer not present");
        return -1;
    }

    img = xc_inflate_buffer(image_buffer, image_size, &img_len);
    if ( img == NULL )
    {
        /* This is the guest image, not a ram disk. */
        ERROR("unable to inflate image buffer");
        return -1;
    }

    sts = xc_hvm_build_internal(xc_handle, domid, memsize,
                                img, img_len);

    /* xc_inflate_buffer may return the original buffer pointer (for
       already inflated buffers), so exercise some care in freeing.
       img is known to be non-NULL here. */

    if ( img != image_buffer )
        free(img);

    return sts;
}
|---|
| 399 | |
|---|
| 400 | /* |
|---|
| 401 | * Local variables: |
|---|
| 402 | * mode: C |
|---|
| 403 | * c-set-style: "BSD" |
|---|
| 404 | * c-basic-offset: 4 |
|---|
| 405 | * tab-width: 4 |
|---|
| 406 | * indent-tabs-mode: nil |
|---|
| 407 | * End: |
|---|
| 408 | */ |
|---|