source: trunk/packages/xen-3.1/xen-3.1/xen/arch/x86/mm/shadow/common.c @ 34

Last change on this file since 34 was 34, checked in by hartmans, 18 years ago

Add xen and xen-common

File size: 104.3 KB
1/******************************************************************************
2 * arch/x86/mm/shadow/common.c
3 *
4 * Shadow code that does not need to be multiply compiled.
5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22 */
23
24#include <xen/config.h>
25#include <xen/types.h>
26#include <xen/mm.h>
27#include <xen/trace.h>
28#include <xen/sched.h>
29#include <xen/perfc.h>
30#include <xen/irq.h>
31#include <xen/domain_page.h>
32#include <xen/guest_access.h>
33#include <xen/keyhandler.h>
34#include <asm/event.h>
35#include <asm/page.h>
36#include <asm/current.h>
37#include <asm/flushtlb.h>
38#include <asm/shadow.h>
39#include "private.h"
40
41
42/* Set up the shadow-specific parts of a domain struct at start of day.
43 * Called for every domain from arch_domain_create() */
44void shadow_domain_init(struct domain *d)
45{
46    int i;
47    shadow_lock_init(d);
48    for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
49        INIT_LIST_HEAD(&d->arch.paging.shadow.freelists[i]);
50    INIT_LIST_HEAD(&d->arch.paging.shadow.p2m_freelist);
51    INIT_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows);
52}
53
54/* Set up the shadow-specific parts of a vcpu struct. Note: the most important
55 * job is to initialize the update_paging_modes() function pointer, which is
56 * used to initialize the rest of the resources. Therefore, it does not really
57 * matter which mode v->arch.paging.mode points to initially, as long as it
58 * is one that is compiled into this build.
59 */
60void shadow_vcpu_init(struct vcpu *v)
61{
62#if CONFIG_PAGING_LEVELS == 4
63    v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
64#elif CONFIG_PAGING_LEVELS == 3
65    v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
66#elif CONFIG_PAGING_LEVELS == 2
67    v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
68#endif
69}
70
71#if SHADOW_AUDIT
72int shadow_audit_enable = 0;
73
74static void shadow_audit_key(unsigned char key)
75{
76    shadow_audit_enable = !shadow_audit_enable;
77    printk("%s shadow_audit_enable=%d\n",
78           __func__, shadow_audit_enable);
79}
80
81static int __init shadow_audit_key_init(void)
82{
83    register_keyhandler(
84        'O', shadow_audit_key,  "toggle shadow audits");
85    return 0;
86}
87__initcall(shadow_audit_key_init);
88#endif /* SHADOW_AUDIT */
89
90static void sh_free_log_dirty_bitmap(struct domain *d);
91
92int _shadow_mode_refcounts(struct domain *d)
93{
94    return shadow_mode_refcounts(d);
95}
96
97
98/**************************************************************************/
99/* x86 emulator support for the shadow code
100 */
101
102struct segment_register *hvm_get_seg_reg(
103    enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
104{
105    struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
106    if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) )
107        hvm_get_segment_register(current, seg, seg_reg);
108    return seg_reg;
109}
110
111enum hvm_access_type {
112    hvm_access_insn_fetch, hvm_access_read, hvm_access_write
113};
114
115static int hvm_translate_linear_addr(
116    enum x86_segment seg,
117    unsigned long offset,
118    unsigned int bytes,
119    enum hvm_access_type access_type,
120    struct sh_emulate_ctxt *sh_ctxt,
121    unsigned long *paddr)
122{
123    struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
124    unsigned long limit, addr = offset;
125    uint32_t last_byte;
126
127    if ( sh_ctxt->ctxt.addr_size != 64 )
128    {
129        /*
130         * COMPATIBILITY MODE: Apply segment checks and add base.
131         */
132
133        switch ( access_type )
134        {
135        case hvm_access_read:
136            if ( (reg->attr.fields.type & 0xa) == 0x8 )
137                goto gpf; /* execute-only code segment */
138            break;
139        case hvm_access_write:
140            if ( (reg->attr.fields.type & 0xa) != 0x2 )
141                goto gpf; /* not a writable data segment */
142            break;
143        default:
144            break;
145        }
146
147        /* Calculate the segment limit, including granularity flag. */
148        limit = reg->limit;
149        if ( reg->attr.fields.g )
150            limit = (limit << 12) | 0xfff;
151
152        last_byte = offset + bytes - 1;
153
154        /* Is this a grows-down data segment? Special limit check if so. */
155        if ( (reg->attr.fields.type & 0xc) == 0x4 )
156        {
157            /* Is upper limit 0xFFFF or 0xFFFFFFFF? */
158            if ( !reg->attr.fields.db )
159                last_byte = (uint16_t)last_byte;
160
161            /* Check first byte and last byte against respective bounds. */
162            if ( (offset <= limit) || (last_byte < offset) )
163                goto gpf;
164        }
165        else if ( (last_byte > limit) || (last_byte < offset) )
166            goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */
167
168        /*
169         * Hardware truncates to 32 bits in compatibility mode.
170         * It does not truncate to 16 bits in 16-bit address-size mode.
171         */
172        addr = (uint32_t)(addr + reg->base);
173    }
174    else
175    {
176        /*
177         * LONG MODE: FS and GS add segment base. Addresses must be canonical.
178         */
179
180        if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) )
181            addr += reg->base;
182
183        if ( !is_canonical_address(addr) )
184            goto gpf;
185    }
186
187    *paddr = addr;
188    return 0;   
189
190 gpf:
191    /* Inject #GP(0). */
192    hvm_inject_exception(TRAP_gp_fault, 0, 0);
193    return X86EMUL_EXCEPTION;
194}
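/*
 * Editor's note: the following is an illustrative, standalone sketch of the
 * compatibility-mode limit check performed above, not part of the original
 * file.  It uses a simplified, invented descriptor struct instead of Xen's
 * struct segment_register; only the granularity scaling and the expand-down
 * special case are reproduced.
 */
#if 0 /* example only -- not built */
#include <stdint.h>

struct seg_example {
    uint32_t limit;        /* raw 20-bit limit from the descriptor */
    int g;                 /* granularity: 1 => limit counts 4K units */
    int db;                /* default size: 1 => 32-bit, 0 => 16-bit */
    int expand_down;       /* 1 for a grows-down data segment */
};

/* Return 1 if [offset, offset+bytes) fits the segment, 0 if it would #GP. */
static int seg_check_example(const struct seg_example *s,
                             uint32_t offset, uint32_t bytes)
{
    uint32_t limit = s->g ? (s->limit << 12) | 0xfff : s->limit;
    uint32_t last_byte = offset + bytes - 1;

    if ( s->expand_down )
    {
        /* Valid offsets lie strictly above the limit. */
        if ( !s->db )
            last_byte = (uint16_t)last_byte;
        return (offset > limit) && (last_byte >= offset);
    }

    /* Ordinary segment: last byte must not pass the limit or wrap. */
    return (last_byte <= limit) && (last_byte >= offset);
}
#endif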
195
196static int
197hvm_read(enum x86_segment seg,
198         unsigned long offset,
199         unsigned long *val,
200         unsigned int bytes,
201         enum hvm_access_type access_type,
202         struct sh_emulate_ctxt *sh_ctxt)
203{
204    unsigned long addr;
205    int rc, errcode;
206
207    rc = hvm_translate_linear_addr(
208        seg, offset, bytes, access_type, sh_ctxt, &addr);
209    if ( rc )
210        return rc;
211
212    *val = 0;
213    // XXX -- this is WRONG.
214    //        It entirely ignores the permissions in the page tables.
215    //        In this case, that is only a user vs supervisor access check.
216    //
217    if ( (rc = hvm_copy_from_guest_virt(val, addr, bytes)) == 0 )
218        return X86EMUL_OKAY;
219
220    /* If we got here, there was nothing mapped here, or a bad GFN
221     * was mapped here.  This should never happen: we're here because
222     * of a write fault at the end of the instruction we're emulating. */ 
223    SHADOW_PRINTK("read failed to va %#lx\n", addr);
224    errcode = ring_3(sh_ctxt->ctxt.regs) ? PFEC_user_mode : 0;
225    if ( access_type == hvm_access_insn_fetch )
226        errcode |= PFEC_insn_fetch;
227    hvm_inject_exception(TRAP_page_fault, errcode, addr + bytes - rc);
228    return X86EMUL_EXCEPTION;
229}
230
231static int
232hvm_emulate_read(enum x86_segment seg,
233                 unsigned long offset,
234                 unsigned long *val,
235                 unsigned int bytes,
236                 struct x86_emulate_ctxt *ctxt)
237{
238    return hvm_read(seg, offset, val, bytes, hvm_access_read,
239                    container_of(ctxt, struct sh_emulate_ctxt, ctxt));
240}
241
242static int
243hvm_emulate_insn_fetch(enum x86_segment seg,
244                       unsigned long offset,
245                       unsigned long *val,
246                       unsigned int bytes,
247                       struct x86_emulate_ctxt *ctxt)
248{
249    struct sh_emulate_ctxt *sh_ctxt =
250        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
251    unsigned int insn_off = offset - ctxt->regs->eip;
252
253    /* Fall back if requested bytes are not in the prefetch cache. */
254    if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
255        return hvm_read(seg, offset, val, bytes,
256                        hvm_access_insn_fetch, sh_ctxt);
257
258    /* Hit the cache. Simple memcpy. */
259    *val = 0;
260    memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes);
261    return X86EMUL_OKAY;
262}
263
264static int
265hvm_emulate_write(enum x86_segment seg,
266                  unsigned long offset,
267                  unsigned long val,
268                  unsigned int bytes,
269                  struct x86_emulate_ctxt *ctxt)
270{
271    struct sh_emulate_ctxt *sh_ctxt =
272        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
273    struct vcpu *v = current;
274    unsigned long addr;
275    int rc;
276
277    /* How many emulations could we save if we unshadowed on stack writes? */
278    if ( seg == x86_seg_ss )
279        perfc_incr(shadow_fault_emulate_stack);
280
281    rc = hvm_translate_linear_addr(
282        seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
283    if ( rc )
284        return rc;
285
286    return v->arch.paging.mode->shadow.x86_emulate_write(
287        v, addr, &val, bytes, sh_ctxt);
288}
289
290static int 
291hvm_emulate_cmpxchg(enum x86_segment seg,
292                    unsigned long offset,
293                    unsigned long old,
294                    unsigned long new,
295                    unsigned int bytes,
296                    struct x86_emulate_ctxt *ctxt)
297{
298    struct sh_emulate_ctxt *sh_ctxt =
299        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
300    struct vcpu *v = current;
301    unsigned long addr;
302    int rc;
303
304    rc = hvm_translate_linear_addr(
305        seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
306    if ( rc )
307        return rc;
308
309    return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
310        v, addr, old, new, bytes, sh_ctxt);
311}
312
313static int 
314hvm_emulate_cmpxchg8b(enum x86_segment seg,
315                      unsigned long offset,
316                      unsigned long old_lo,
317                      unsigned long old_hi,
318                      unsigned long new_lo,
319                      unsigned long new_hi,
320                      struct x86_emulate_ctxt *ctxt)
321{
322    struct sh_emulate_ctxt *sh_ctxt =
323        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
324    struct vcpu *v = current;
325    unsigned long addr;
326    int rc;
327
328    rc = hvm_translate_linear_addr(
329        seg, offset, 8, hvm_access_write, sh_ctxt, &addr);
330    if ( rc )
331        return rc;
332
333    return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
334        v, addr, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
335}
336
337static struct x86_emulate_ops hvm_shadow_emulator_ops = {
338    .read       = hvm_emulate_read,
339    .insn_fetch = hvm_emulate_insn_fetch,
340    .write      = hvm_emulate_write,
341    .cmpxchg    = hvm_emulate_cmpxchg,
342    .cmpxchg8b  = hvm_emulate_cmpxchg8b,
343};
344
345static int
346pv_emulate_read(enum x86_segment seg,
347                unsigned long offset,
348                unsigned long *val,
349                unsigned int bytes,
350                struct x86_emulate_ctxt *ctxt)
351{
352    unsigned int rc;
353
354    *val = 0;
355    if ( (rc = copy_from_user((void *)val, (void *)offset, bytes)) != 0 )
356    {
357        propagate_page_fault(offset + bytes - rc, 0); /* read fault */
358        return X86EMUL_EXCEPTION;
359    }
360
361    return X86EMUL_OKAY;
362}
363
364static int
365pv_emulate_write(enum x86_segment seg,
366                 unsigned long offset,
367                 unsigned long val,
368                 unsigned int bytes,
369                 struct x86_emulate_ctxt *ctxt)
370{
371    struct sh_emulate_ctxt *sh_ctxt =
372        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
373    struct vcpu *v = current;
374    return v->arch.paging.mode->shadow.x86_emulate_write(
375        v, offset, &val, bytes, sh_ctxt);
376}
377
378static int 
379pv_emulate_cmpxchg(enum x86_segment seg,
380                   unsigned long offset,
381                   unsigned long old,
382                   unsigned long new,
383                   unsigned int bytes,
384                   struct x86_emulate_ctxt *ctxt)
385{
386    struct sh_emulate_ctxt *sh_ctxt =
387        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
388    struct vcpu *v = current;
389    return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
390        v, offset, old, new, bytes, sh_ctxt);
391}
392
393static int 
394pv_emulate_cmpxchg8b(enum x86_segment seg,
395                     unsigned long offset,
396                     unsigned long old_lo,
397                     unsigned long old_hi,
398                     unsigned long new_lo,
399                     unsigned long new_hi,
400                     struct x86_emulate_ctxt *ctxt)
401{
402    struct sh_emulate_ctxt *sh_ctxt =
403        container_of(ctxt, struct sh_emulate_ctxt, ctxt);
404    struct vcpu *v = current;
405    return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
406        v, offset, old_lo, old_hi, new_lo, new_hi, sh_ctxt);
407}
408
409static struct x86_emulate_ops pv_shadow_emulator_ops = {
410    .read       = pv_emulate_read,
411    .insn_fetch = pv_emulate_read,
412    .write      = pv_emulate_write,
413    .cmpxchg    = pv_emulate_cmpxchg,
414    .cmpxchg8b  = pv_emulate_cmpxchg8b,
415};
416
417struct x86_emulate_ops *shadow_init_emulation(
418    struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
419{
420    struct segment_register *creg, *sreg;
421    struct vcpu *v = current;
422    unsigned long addr;
423
424    sh_ctxt->ctxt.regs = regs;
425
426    if ( !is_hvm_vcpu(v) )
427    {
428        sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = BITS_PER_LONG;
429        return &pv_shadow_emulator_ops;
430    }
431
432    /* Segment cache initialisation. Primed with CS. */
433    sh_ctxt->valid_seg_regs = 0;
434    creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
435
436    /* Work out the emulation mode. */
437    if ( hvm_long_mode_enabled(v) && creg->attr.fields.l )
438    {
439        sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 64;
440    }
441    else if ( regs->eflags & X86_EFLAGS_VM )
442    {
443        sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 16;
444    }
445    else
446    {
447        sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
448        sh_ctxt->ctxt.addr_size = creg->attr.fields.db ? 32 : 16;
449        sh_ctxt->ctxt.sp_size   = sreg->attr.fields.db ? 32 : 16;
450    }
451
452    /* Attempt to prefetch whole instruction. */
453    sh_ctxt->insn_buf_bytes =
454        (!hvm_translate_linear_addr(
455            x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
456            hvm_access_insn_fetch, sh_ctxt, &addr) &&
457         !hvm_copy_from_guest_virt(
458             sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf)))
459        ? sizeof(sh_ctxt->insn_buf) : 0;
460
461    return &hvm_shadow_emulator_ops;
462}
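/*
 * Editor's note: an illustrative sketch (not part of the original file) of
 * the address/stack-size decision made above, reduced to plain flags rather
 * than Xen's vcpu and segment structures.  All names here are invented for
 * the example.
 */
#if 0 /* example only -- not built */
static void emul_sizes_example(int long_mode_active, int cs_l, int vm86,
                               int cs_db, int ss_db,
                               int *addr_size, int *sp_size)
{
    if ( long_mode_active && cs_l )     /* 64-bit code segment */
        *addr_size = *sp_size = 64;
    else if ( vm86 )                    /* virtual-8086 mode */
        *addr_size = *sp_size = 16;
    else                                /* protected mode: use CS.D and SS.B */
    {
        *addr_size = cs_db ? 32 : 16;
        *sp_size   = ss_db ? 32 : 16;
    }
}
#endif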
463
464/**************************************************************************/
465/* Code for "promoting" a guest page to the point where the shadow code is
466 * willing to let it be treated as a guest page table.  This generally
467 * involves making sure there are no writable mappings available to the guest
468 * for this page.
469 */
470void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
471{
472    struct page_info *page = mfn_to_page(gmfn);
473
474    ASSERT(mfn_valid(gmfn));
475
476    /* We should never try to promote a gmfn that has writeable mappings */
477    ASSERT(sh_remove_write_access(v, gmfn, 0, 0) == 0);
478
479    /* Is the page already shadowed? */
480    if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
481        page->shadow_flags = 0;
482
483    ASSERT(!test_bit(type, &page->shadow_flags));
484    set_bit(type, &page->shadow_flags);
485}
486
487void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
488{
489    struct page_info *page = mfn_to_page(gmfn);
490
491    ASSERT(test_bit(_PGC_page_table, &page->count_info));
492    ASSERT(test_bit(type, &page->shadow_flags));
493
494    clear_bit(type, &page->shadow_flags);
495
496    if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
497    {
498        /* tlbflush timestamp field is valid again */
499        page->tlbflush_timestamp = tlbflush_current_time();
500        clear_bit(_PGC_page_table, &page->count_info);
501    }
502}
503
504/**************************************************************************/
505/* Validate a pagetable change from the guest and update the shadows.
506 * Returns a bitmask of SHADOW_SET_* flags. */
507
508int
509sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
510{
511    int result = 0;
512    struct page_info *page = mfn_to_page(gmfn);
513
514    sh_mark_dirty(v->domain, gmfn);
515   
516    // Determine which types of shadows are affected, and update each.
517    //
518    // Always validate L1s before L2s to prevent another cpu with a linear
519    // mapping of this gmfn from seeing a walk that results from
520    // using the new L2 value and the old L1 value.  (It is OK for such a
521    // guest to see a walk that uses the old L2 value with the new L1 value,
522    // as hardware could behave this way if one level of the pagewalk occurs
523    // before the store, and the next level of the pagewalk occurs after the
525    // store.)
525    //
526    // Ditto for L2s before L3s, etc.
527    //
528
529    if ( !(page->count_info & PGC_page_table) )
530        return 0;  /* Not shadowed at all */
531
532#if CONFIG_PAGING_LEVELS == 2
533    if ( page->shadow_flags & SHF_L1_32 ) 
534        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
535            (v, gmfn, entry, size);
536#else
537    if ( page->shadow_flags & SHF_L1_32 ) 
538        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
539            (v, gmfn, entry, size);
540#endif
541
542#if CONFIG_PAGING_LEVELS == 2
543    if ( page->shadow_flags & SHF_L2_32 ) 
544        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
545            (v, gmfn, entry, size);
546#else
547    if ( page->shadow_flags & SHF_L2_32 ) 
548        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
549            (v, gmfn, entry, size);
550#endif
551
552#if CONFIG_PAGING_LEVELS >= 3
553    if ( page->shadow_flags & SHF_L1_PAE ) 
554        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
555            (v, gmfn, entry, size);
556    if ( page->shadow_flags & SHF_L2_PAE ) 
557        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
558            (v, gmfn, entry, size);
559    if ( page->shadow_flags & SHF_L2H_PAE ) 
560        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
561            (v, gmfn, entry, size);
562#else /* 32-bit non-PAE hypervisor does not support PAE guests */
563    ASSERT((page->shadow_flags & (SHF_L2H_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
564#endif
565
566#if CONFIG_PAGING_LEVELS >= 4
567    if ( page->shadow_flags & SHF_L1_64 ) 
568        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
569            (v, gmfn, entry, size);
570    if ( page->shadow_flags & SHF_L2_64 ) 
571        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
572            (v, gmfn, entry, size);
573    if ( page->shadow_flags & SHF_L2H_64 ) 
574        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 4, 4)
575            (v, gmfn, entry, size);
576    if ( page->shadow_flags & SHF_L3_64 ) 
577        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
578            (v, gmfn, entry, size);
579    if ( page->shadow_flags & SHF_L4_64 ) 
580        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
581            (v, gmfn, entry, size);
582#else /* 32-bit/PAE hypervisor does not support 64-bit guests */
583    ASSERT((page->shadow_flags
584            & (SHF_L4_64|SHF_L3_64|SHF_L2H_64|SHF_L2_64|SHF_L1_64)) == 0);
585#endif
586
587    return result;
588}
589
590
591void
592sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
593                           void *entry, u32 size)
594/* This is the entry point for emulated writes to pagetables in HVM guests and
595 * PV translated guests.
596 */
597{
598    struct domain *d = v->domain;
599    int rc;
600
601    ASSERT(shadow_locked_by_me(v->domain));
602    rc = sh_validate_guest_entry(v, gmfn, entry, size);
603    if ( rc & SHADOW_SET_FLUSH )
604        /* Need to flush TLBs to pick up shadow PT changes */
605        flush_tlb_mask(d->domain_dirty_cpumask);
606    if ( rc & SHADOW_SET_ERROR ) 
607    {
608        /* This page is probably not a pagetable any more: tear it out of the
609         * shadows, along with any tables that reference it. 
610         * Since the validate call above will have made a "safe" (i.e. zero)
611         * shadow entry, we can let the domain live even if we can't fully
612         * unshadow the page. */
613        sh_remove_shadows(v, gmfn, 0, 0);
614    }
615}
616
617int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
618                             intpte_t new, mfn_t gmfn)
619/* Write a new value into the guest pagetable, and update the shadows
620 * appropriately.  Returns 0 if we page-faulted, 1 for success. */
621{
622    int failed;
623    shadow_lock(v->domain);
624    failed = __copy_to_user(p, &new, sizeof(new));
625    if ( failed != sizeof(new) )
626        sh_validate_guest_entry(v, gmfn, p, sizeof(new));
627    shadow_unlock(v->domain);
628    return (failed == 0);
629}
630
631int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
632                               intpte_t *old, intpte_t new, mfn_t gmfn)
633/* Cmpxchg a new value into the guest pagetable, and update the shadows
634 * appropriately. Returns 0 if we page-faulted, 1 if not.
635 * N.B. caller should check the value of "old" to see if the
636 * cmpxchg itself was successful. */
637{
638    int failed;
639    intpte_t t = *old;
640    shadow_lock(v->domain);
641    failed = cmpxchg_user(p, t, new);
642    if ( t == *old )
643        sh_validate_guest_entry(v, gmfn, p, sizeof(new));
644    *old = t;
645    shadow_unlock(v->domain);
646    return (failed == 0);
647}
648
649
650/**************************************************************************/
651/* Memory management for shadow pages. */ 
652
653/* Allocating shadow pages
654 * -----------------------
655 *
656 * Most shadow pages are allocated singly, but there is one case where
657 * we need to allocate multiple pages together: shadowing 32-bit guest
658 * tables on PAE or 64-bit shadows.  A 32-bit guest l1 table covers 4MB
659 * of virtual address space, and needs to be shadowed by two PAE/64-bit
660 * l1 tables (covering 2MB of virtual address space each).  Similarly, a
661 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
662 * PAE/64-bit l2 tables (1GB va each).  These multi-page shadows are
663 * contiguous and aligned; functions for handling offsets into them are
664 * defined in shadow.c (shadow_l1_index() etc.)
665 *   
666 * This table shows the allocation behaviour of the different modes:
667 *
668 * Xen paging      32b  pae  pae  64b  64b  64b
669 * Guest paging    32b  32b  pae  32b  pae  64b
670 * PV or HVM        *   HVM   *   HVM  HVM   *
671 * Shadow paging   32b  pae  pae  pae  pae  64b
672 *
673 * sl1 size         4k   8k   4k   8k   4k   4k
674 * sl2 size         4k  16k   4k  16k   4k   4k
675 * sl3 size         -    -    -    -    -    4k
676 * sl4 size         -    -    -    -    -    4k
677 *
678 * We allocate memory from xen in four-page units and break them down
679 * with a simple buddy allocator.  Can't use the xen allocator to handle
680 * this as it only works for contiguous zones, and a domain's shadow
681 * pool is made of fragments.
682 *
683 * In HVM guests, the p2m table is built out of shadow pages, and we provide
684 * a function for the p2m management to steal pages, in max-order chunks, from
685 * the free pool.  We don't provide for giving them back, yet.
686 */
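/*
 * Editor's note (worked example, not part of the original file): the 8k/16k
 * rows in the table above follow from coverage arithmetic.  A 32-bit guest l1
 * has 1024 4-byte entries mapping 4KB each, i.e. 4MB; a PAE/64-bit l1 has 512
 * 8-byte entries, i.e. 2MB, so shadowing it takes 4MB / 2MB = 2 contiguous
 * pages (8k).  A 32-bit guest l2 covers 4GB while a PAE/64-bit l2 covers
 * 512 * 2MB = 1GB, so its shadow takes 4 contiguous pages (16k) -- hence the
 * order-2 entry for SH_type_l2_32_shadow in shadow_order() below.
 */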
687
688/* Figure out the least acceptable quantity of shadow memory.
689 * The minimum memory requirement for always being able to free up a
690 * chunk of memory is very small -- only three max-order chunks per
691 * vcpu to hold the top level shadows and pages with Xen mappings in them. 
692 *
693 * But for a guest to be guaranteed to successfully execute a single
694 * instruction, we must be able to map a large number (about thirty) of VAs
695 * at the same time, which means that to guarantee progress, we must
696 * allow for more than ninety allocated pages per vcpu.  We round that
697 * up to 128 pages, or half a megabyte per vcpu. */
698unsigned int shadow_min_acceptable_pages(struct domain *d) 
699{
700    u32 vcpu_count = 0;
701    struct vcpu *v;
702
703    for_each_vcpu(d, v)
704        vcpu_count++;
705
706    return (vcpu_count * 128);
707} 
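/*
 * Editor's note (worked example, not part of the original file): with 4KB
 * pages the 128-page floor above is 128 * 4KB = 512KB per vcpu, so e.g. a
 * 4-vcpu domain has a minimum acceptable pool of 4 * 128 = 512 pages (2MB),
 * before the extra per-MB-of-RAM allowance that sh_set_allocation() adds for
 * the p2m table.
 */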
708
709/* Figure out the order of allocation needed for a given shadow type */
710static inline u32
711shadow_order(unsigned int shadow_type) 
712{
713#if CONFIG_PAGING_LEVELS > 2
714    static const u32 type_to_order[SH_type_unused] = {
715        0, /* SH_type_none           */
716        1, /* SH_type_l1_32_shadow   */
717        1, /* SH_type_fl1_32_shadow  */
718        2, /* SH_type_l2_32_shadow   */
719        0, /* SH_type_l1_pae_shadow  */
720        0, /* SH_type_fl1_pae_shadow */
721        0, /* SH_type_l2_pae_shadow  */
722        0, /* SH_type_l2h_pae_shadow */
723        0, /* SH_type_l1_64_shadow   */
724        0, /* SH_type_fl1_64_shadow  */
725        0, /* SH_type_l2_64_shadow   */
726        0, /* SH_type_l2h_64_shadow  */
727        0, /* SH_type_l3_64_shadow   */
728        0, /* SH_type_l4_64_shadow   */
729        2, /* SH_type_p2m_table      */
730        0  /* SH_type_monitor_table  */
731        };
732    ASSERT(shadow_type < SH_type_unused);
733    return type_to_order[shadow_type];
734#else  /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
735    return 0;
736#endif
737}
738
739
740/* Do we have a free chunk of at least this order? */
741static inline int chunk_is_available(struct domain *d, int order)
742{
743    int i;
744   
745    for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
746        if ( !list_empty(&d->arch.paging.shadow.freelists[i]) )
747            return 1;
748    return 0;
749}
750
751/* Dispatcher function: call the per-mode function that will unhook the
752 * non-Xen mappings in this top-level shadow mfn */
753void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
754{
755    struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
756    switch ( sp->type )
757    {
758    case SH_type_l2_32_shadow:
759#if CONFIG_PAGING_LEVELS == 2
760        SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
761#else
762        SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
763#endif
764        break;
765#if CONFIG_PAGING_LEVELS >= 3
766    case SH_type_l2_pae_shadow:
767    case SH_type_l2h_pae_shadow:
768        SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
769        break;
770#endif
771#if CONFIG_PAGING_LEVELS >= 4
772    case SH_type_l4_64_shadow:
773        SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
774        break;
775#endif
776    default:
777        SHADOW_PRINTK("top-level shadow has bad type %08x\n", sp->type);
778        BUG();
779    }
780}
781
782
783/* Make sure there is at least one chunk of the required order available
784 * in the shadow page pool. This must be called before any calls to
785 * shadow_alloc().  Since this will free existing shadows to make room,
786 * it must be called early enough to avoid freeing shadows that the
787 * caller is currently working on. */
788void shadow_prealloc(struct domain *d, unsigned int order)
789{
790    /* Need a vcpu for calling unpins; for now, since we don't have
791     * per-vcpu shadows, any will do */
792    struct vcpu *v, *v2;
793    struct list_head *l, *t;
794    struct shadow_page_info *sp;
795    cpumask_t flushmask = CPU_MASK_NONE;
796    mfn_t smfn;
797    int i;
798
799    if ( chunk_is_available(d, order) ) return; 
800   
801    v = current;
802    if ( v->domain != d )
803        v = d->vcpu[0];
804    ASSERT(v != NULL); /* Shouldn't have enabled shadows if we've no vcpus  */
805
806    /* Stage one: walk the list of pinned pages, unpinning them */
807    perfc_incr(shadow_prealloc_1);
808    list_for_each_backwards_safe(l, t, &d->arch.paging.shadow.pinned_shadows)
809    {
810        sp = list_entry(l, struct shadow_page_info, list);
811        smfn = shadow_page_to_mfn(sp);
812
813        /* Unpin this top-level shadow */
814        sh_unpin(v, smfn);
815
816        /* See if that freed up a chunk of appropriate size */
817        if ( chunk_is_available(d, order) ) return;
818    }
819
820    /* Stage two: all shadow pages are in use in hierarchies that are
821     * loaded in cr3 on some vcpu.  Walk them, unhooking the non-Xen
822     * mappings. */
823    perfc_incr(shadow_prealloc_2);
824
825    for_each_vcpu(d, v2) 
826        for ( i = 0 ; i < 4 ; i++ )
827        {
828            if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
829            {
830                shadow_unhook_mappings(v, 
831                               pagetable_get_mfn(v2->arch.shadow_table[i]));
832                cpus_or(flushmask, v2->vcpu_dirty_cpumask, flushmask);
833
834                /* See if that freed up a chunk of appropriate size */
835                if ( chunk_is_available(d, order) ) 
836                {
837                    flush_tlb_mask(flushmask);
838                    return;
839                }
840            }
841        }
842   
843    /* Nothing more we can do: all remaining shadows are of pages that
844     * hold Xen mappings for some vcpu.  This should never happen. */
845    SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n"
846                   "  shadow pages total = %u, free = %u, p2m=%u\n",
847                   1 << order, 
848                   d->arch.paging.shadow.total_pages, 
849                   d->arch.paging.shadow.free_pages, 
850                   d->arch.paging.shadow.p2m_pages);
851    BUG();
852}
853
854/* Deliberately free all the memory we can: this will tear down all of
855 * this domain's shadows */
856static void shadow_blow_tables(struct domain *d) 
857{
858    struct list_head *l, *t;
859    struct shadow_page_info *sp;
860    struct vcpu *v = d->vcpu[0];
861    mfn_t smfn;
862    int i;
863
864    ASSERT(v != NULL);
865
866    /* Pass one: unpin all pinned pages */
867    list_for_each_backwards_safe(l,t, &d->arch.paging.shadow.pinned_shadows)
868    {
869        sp = list_entry(l, struct shadow_page_info, list);
870        smfn = shadow_page_to_mfn(sp);
871        sh_unpin(v, smfn);
872    }
873       
874    /* Second pass: unhook entries of in-use shadows */
875    for_each_vcpu(d, v) 
876        for ( i = 0 ; i < 4 ; i++ )
877            if ( !pagetable_is_null(v->arch.shadow_table[i]) )
878                shadow_unhook_mappings(v, 
879                               pagetable_get_mfn(v->arch.shadow_table[i]));
880
881    /* Make sure everyone sees the unshadowings */
882    flush_tlb_mask(d->domain_dirty_cpumask);
883}
884
885
886#ifndef NDEBUG
887/* Blow all shadows of all shadowed domains: this can be used to cause the
888 * guest's pagetables to be re-shadowed if we suspect that the shadows
889 * have somehow got out of sync */
890static void shadow_blow_all_tables(unsigned char c)
891{
892    struct domain *d;
893    printk("'%c' pressed -> blowing all shadow tables\n", c);
894    rcu_read_lock(&domlist_read_lock);
895    for_each_domain(d)
896    {
897        if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
898        {
899            shadow_lock(d);
900            shadow_blow_tables(d);
901            shadow_unlock(d);
902        }
903    }
904    rcu_read_unlock(&domlist_read_lock);
905}
906
907/* Register this function in the Xen console keypress table */
908static __init int shadow_blow_tables_keyhandler_init(void)
909{
910    register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
911    return 0;
912}
913__initcall(shadow_blow_tables_keyhandler_init);
914#endif /* !NDEBUG */
915
916/* Allocate another shadow's worth of (contiguous, aligned) pages,
917 * and fill in the type and backpointer fields of their page_infos.
918 * Never fails to allocate. */
919mfn_t shadow_alloc(struct domain *d, 
920                    u32 shadow_type,
921                    unsigned long backpointer)
922{
923    struct shadow_page_info *sp = NULL;
924    unsigned int order = shadow_order(shadow_type);
925    cpumask_t mask;
926    void *p;
927    int i;
928
929    ASSERT(shadow_locked_by_me(d));
930    ASSERT(order <= SHADOW_MAX_ORDER);
931    ASSERT(shadow_type != SH_type_none);
932    perfc_incr(shadow_alloc);
933
934    /* Find smallest order which can satisfy the request. */
935    for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
936        if ( !list_empty(&d->arch.paging.shadow.freelists[i]) )
937            goto found;
938   
939    /* If we get here, we failed to allocate. This should never happen.
940     * It means that we didn't call shadow_prealloc() correctly before
941     * we allocated.  We can't recover by calling prealloc here, because
942     * we might free up higher-level pages that the caller is working on. */
943    SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
944    BUG();
945
946 found:
947    sp = list_entry(d->arch.paging.shadow.freelists[i].next, 
948                    struct shadow_page_info, list);
949    list_del(&sp->list);
950           
951    /* We may have to halve the chunk a number of times. */
952    while ( i != order )
953    {
954        i--;
955        sp->order = i;
956        list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[i]);
957        sp += 1 << i;
958    }
959    d->arch.paging.shadow.free_pages -= 1 << order;
960
961    /* Init page info fields and clear the pages */
962    for ( i = 0; i < 1<<order ; i++ ) 
963    {
964        /* Before we overwrite the old contents of this page,
965         * we need to be sure that no TLB holds a pointer to it. */
966        mask = d->domain_dirty_cpumask;
967        tlbflush_filter(mask, sp[i].tlbflush_timestamp);
968        if ( unlikely(!cpus_empty(mask)) )
969        {
970            perfc_incr(shadow_alloc_tlbflush);
971            flush_tlb_mask(mask);
972        }
973        /* Now safe to clear the page for reuse */
974        p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
975        ASSERT(p != NULL);
976        clear_page(p);
977        sh_unmap_domain_page(p);
978        INIT_LIST_HEAD(&sp[i].list);
979        sp[i].type = shadow_type;
980        sp[i].pinned = 0;
981        sp[i].count = 0;
982        sp[i].backpointer = backpointer;
983        sp[i].next_shadow = NULL;
984        perfc_incr(shadow_alloc_count);
985    }
986    return shadow_page_to_mfn(sp);
987}
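/*
 * Editor's note (worked example, not part of the original file): suppose the
 * only free chunk has order 2 (four pages, indices 0-3) and an order-0 shadow
 * is requested.  The halving loop above runs twice: it re-lists pages 0-1 as
 * a free order-1 block and advances sp to page 2, then re-lists page 2 as a
 * free order-0 block and advances sp to page 3, which is what gets returned.
 * The free lists are left holding one order-1 and one order-0 block, exactly
 * as a buddy allocator should.
 */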
988
989
990/* Return some shadow pages to the pool. */
991void shadow_free(struct domain *d, mfn_t smfn)
992{
993    struct shadow_page_info *sp = mfn_to_shadow_page(smfn); 
994    u32 shadow_type;
995    unsigned long order;
996    unsigned long mask;
997    int i;
998
999    ASSERT(shadow_locked_by_me(d));
1000    perfc_incr(shadow_free);
1001
1002    shadow_type = sp->type;
1003    ASSERT(shadow_type != SH_type_none);
1004    ASSERT(shadow_type != SH_type_p2m_table);
1005    order = shadow_order(shadow_type);
1006
1007    d->arch.paging.shadow.free_pages += 1 << order;
1008
1009    for ( i = 0; i < 1<<order; i++ ) 
1010    {
1011#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1012        struct vcpu *v;
1013        for_each_vcpu(d, v) 
1014        {
1015            /* No longer safe to look for a writeable mapping in this shadow */
1016            if ( v->arch.paging.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i ) 
1017                v->arch.paging.shadow.last_writeable_pte_smfn = 0;
1018        }
1019#endif
1020        /* Strip out the type: this is now a free shadow page */
1021        sp[i].type = 0;
1022        /* Remember the TLB timestamp so we will know whether to flush
1023         * TLBs when we reuse the page.  Because the destructors leave the
1024         * contents of the pages in place, we can delay TLB flushes until
1025         * just before the allocator hands the page out again. */
1026        sp[i].tlbflush_timestamp = tlbflush_current_time();
1027        perfc_decr(shadow_alloc_count);
1028    }
1029
1030    /* Merge chunks as far as possible. */
1031    while ( order < SHADOW_MAX_ORDER )
1032    {
1033        mask = 1 << order;
1034        if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
1035            /* Merge with predecessor block? */
1036            if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
1037                break;
1038            list_del(&(sp-mask)->list);
1039            sp -= mask;
1040        } else {
1041            /* Merge with successor block? */
1042            if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
1043                break;
1044            list_del(&(sp+mask)->list);
1045        }
1046        order++;
1047    }
1048
1049    sp->order = order;
1050    list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
1051}
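/*
 * Editor's note (worked example, not part of the original file): freeing the
 * order-0 page at index 3 from the shadow_alloc() example above coalesces
 * step by step.  At order 0 the mask is 1 and bit 0 of the MFN is set, so the
 * predecessor (page 2) is the buddy; if it is free at order 0 the two merge
 * into an order-1 block at page 2.  At order 1 the mask is 2 and bit 1 of
 * MFN 2 is set, so pages 0-1 are the buddy; if they are free the block grows
 * to order 2 (SHADOW_MAX_ORDER for these four-page chunks) and is re-listed
 * on freelists[2], restoring the original chunk.
 */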
1052
1053/* Divert some memory from the pool to be used by the p2m mapping.
1054 * This action is irreversible: the p2m mapping only ever grows.
1055 * That's OK because the p2m table only exists for translated domains,
1056 * and those domains can't ever turn off shadow mode.
1057 * Also, we only ever allocate a max-order chunk, so as to preserve
1058 * the invariant that shadow_prealloc() always works.
1059 * Returns 0 iff it can't get a chunk (the caller should then
1060 * free up some pages in domheap and call sh_set_allocation);
1061 * returns non-zero on success.
1062 */
1063static int
1064sh_alloc_p2m_pages(struct domain *d)
1065{
1066    struct page_info *pg;
1067    u32 i;
1068    ASSERT(shadow_locked_by_me(d));
1069   
1070    if ( d->arch.paging.shadow.total_pages
1071         < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
1072        return 0; /* Not enough shadow memory: need to increase it first */
1073   
1074    pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
1075    d->arch.paging.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
1076    d->arch.paging.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
1077    for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
1078    {
1079        /* Unlike shadow pages, mark p2m pages as owned by the domain.
1080         * Marking the domain as the owner would normally allow the guest to
1081         * create mappings of these pages, but these p2m pages will never be
1082         * in the domain's guest-physical address space, and so that is not
1083         * believed to be a concern.
1084         */
1085        page_set_owner(&pg[i], d);
1086        pg[i].count_info = 1;
1087        list_add_tail(&pg[i].list, &d->arch.paging.shadow.p2m_freelist);
1088    }
1089    return 1;
1090}
1091
1092// Returns NULL if no memory is available...
1093struct page_info * 
1094shadow_alloc_p2m_page(struct domain *d)
1095{
1096    struct list_head *entry;
1097    struct page_info *pg;
1098    mfn_t mfn;
1099    void *p;
1100   
1101    shadow_lock(d);
1102
1103    if ( list_empty(&d->arch.paging.shadow.p2m_freelist) &&
1104         !sh_alloc_p2m_pages(d) )
1105    {
1106        shadow_unlock(d);
1107        return NULL;
1108    }
1109    entry = d->arch.paging.shadow.p2m_freelist.next;
1110    list_del(entry);
1111
1112    shadow_unlock(d);
1113
1114    pg = list_entry(entry, struct page_info, list);
1115    mfn = page_to_mfn(pg);
1116    p = sh_map_domain_page(mfn);
1117    clear_page(p);
1118    sh_unmap_domain_page(p);
1119
1120    return pg;
1121}
1122
1123void
1124shadow_free_p2m_page(struct domain *d, struct page_info *pg)
1125{
1126    ASSERT(page_get_owner(pg) == d);
1127    /* Should have just the one ref we gave it in alloc_p2m_page() */
1128    if ( (pg->count_info & PGC_count_mask) != 1 )
1129    {
1130        SHADOW_ERROR("Odd p2m page count c=%#x t=%"PRtype_info"\n",
1131                     pg->count_info, pg->u.inuse.type_info);
1132    }
1133    pg->count_info = 0;
1134    /* Free should not decrement domain's total allocation, since
1135     * these pages were allocated without an owner. */
1136    page_set_owner(pg, NULL); 
1137    free_domheap_pages(pg, 0);
1138    d->arch.paging.shadow.p2m_pages--;
1139    perfc_decr(shadow_alloc_count);
1140}
1141
1142#if CONFIG_PAGING_LEVELS == 3
1143static void p2m_install_entry_in_monitors(struct domain *d, 
1144                                          l3_pgentry_t *l3e) 
1145/* Special case, only used for external-mode domains on PAE hosts:
1146 * update the mapping of the p2m table.  Once again, this is trivial in
1147 * other paging modes (one top-level entry points to the top-level p2m,
1148 * no maintenance needed), but PAE makes life difficult by needing a
1149 * copy of the eight l3es of the p2m table in eight l2h slots in the
1150 * monitor table.  This function makes fresh copies when a p2m l3e
1151 * changes. */
1152{
1153    l2_pgentry_t *ml2e;
1154    struct vcpu *v;
1155    unsigned int index;
1156
1157    index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
1158    ASSERT(index < MACHPHYS_MBYTES>>1);
1159
1160    for_each_vcpu(d, v) 
1161    {
1162        if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) 
1163            continue;
1164        ASSERT(shadow_mode_external(v->domain));
1165
1166        SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
1167                      d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
1168
1169        if ( v == current ) /* OK to use linear map of monitor_table */
1170            ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
1171        else 
1172        {
1173            l3_pgentry_t *ml3e;
1174            ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
1175            ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
1176            ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
1177            ml2e += l2_table_offset(RO_MPT_VIRT_START);
1178            sh_unmap_domain_page(ml3e);
1179        }
1180        ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
1181        if ( v != current )
1182            sh_unmap_domain_page(ml2e);
1183    }
1184}
1185#endif
1186
1187/* Set the pool of shadow pages to the required number of pages.
1188 * Input will be rounded up to at least shadow_min_acceptable_pages(),
1189 * plus space for the p2m table.
1190 * Returns 0 for success, non-zero for failure. */
1191static unsigned int sh_set_allocation(struct domain *d, 
1192                                      unsigned int pages,
1193                                      int *preempted)
1194{
1195    struct shadow_page_info *sp;
1196    unsigned int lower_bound;
1197    int j;
1198
1199    ASSERT(shadow_locked_by_me(d));
1200   
1201    /* Don't allocate less than the minimum acceptable, plus one page per
1202     * megabyte of RAM (for the p2m table) */
1203    lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
1204    if ( pages > 0 && pages < lower_bound )
1205        pages = lower_bound;
1206    /* Round up to largest block size */
1207    pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
1208
1209    SHADOW_PRINTK("current %i target %i\n", 
1210                   d->arch.paging.shadow.total_pages, pages);
1211
1212    while ( d->arch.paging.shadow.total_pages != pages ) 
1213    {
1214        if ( d->arch.paging.shadow.total_pages < pages ) 
1215        {
1216            /* Need to allocate more memory from domheap */
1217            sp = (struct shadow_page_info *)
1218                alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0); 
1219            if ( sp == NULL ) 
1220            { 
1221                SHADOW_PRINTK("failed to allocate shadow pages.\n");
1222                return -ENOMEM;
1223            }
1224            d->arch.paging.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
1225            d->arch.paging.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
1226            for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ ) 
1227            {
1228                sp[j].type = 0; 
1229                sp[j].pinned = 0;
1230                sp[j].count = 0;
1231                sp[j].mbz = 0;
1232                sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
1233            }
1234            sp->order = SHADOW_MAX_ORDER;
1235            list_add_tail(&sp->list, 
1236                          &d->arch.paging.shadow.freelists[SHADOW_MAX_ORDER]);
1237        } 
1238        else if ( d->arch.paging.shadow.total_pages > pages ) 
1239        {
1240            /* Need to return memory to domheap */
1241            shadow_prealloc(d, SHADOW_MAX_ORDER);
1242            ASSERT(!list_empty(&d->arch.paging.shadow.freelists[SHADOW_MAX_ORDER]));
1243            sp = list_entry(d->arch.paging.shadow.freelists[SHADOW_MAX_ORDER].next, 
1244                            struct shadow_page_info, list);
1245            list_del(&sp->list);
1246            d->arch.paging.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
1247            d->arch.paging.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
1248            free_domheap_pages((struct page_info *)sp, SHADOW_MAX_ORDER);
1249        }
1250
1251        /* Check to see if we need to yield and try again */
1252        if ( preempted && hypercall_preempt_check() )
1253        {
1254            *preempted = 1;
1255            return 0;
1256        }
1257    }
1258
1259    return 0;
1260}
1261
1262/* Return the size of the shadow pool, rounded up to the nearest MB */
1263static unsigned int shadow_get_allocation(struct domain *d)
1264{
1265    unsigned int pg = d->arch.paging.shadow.total_pages;
1266    return ((pg >> (20 - PAGE_SHIFT))
1267            + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
1268}
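/*
 * Editor's note (worked example, not part of the original file): with 4KB
 * pages PAGE_SHIFT is 12, so (20 - PAGE_SHIFT) is 8 and a megabyte is 256
 * pages.  A pool of 300 pages therefore reports 300 >> 8 = 1, plus 1 for the
 * non-zero remainder (300 & 255 = 44), i.e. 2MB.
 */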
1269
1270/**************************************************************************/
1271/* Hash table for storing the guest->shadow mappings.
1272 * The table itself is an array of pointers to shadows; the shadows are then
1273 * threaded on a singly-linked list of shadows with the same hash value */
1274
1275#define SHADOW_HASH_BUCKETS 251
1276/* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
1277
1278/* Hash function that takes a gfn or mfn, plus another byte of type info */
1279typedef u32 key_t;
1280static inline key_t sh_hash(unsigned long n, unsigned int t) 
1281{
1282    unsigned char *p = (unsigned char *)&n;
1283    key_t k = t;
1284    int i;
1285    for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
1286    return k % SHADOW_HASH_BUCKETS;
1287}
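/*
 * Editor's note: the function above is the classic sdbm byte hash
 * (k = c + (k << 6) + (k << 16) - k), seeded with the shadow type and folded
 * over the bytes of the gfn/mfn.  Below is an illustrative, standalone
 * version (not part of the original file); the bucket count mirrors
 * SHADOW_HASH_BUCKETS and the other names are invented for the example.
 */
#if 0 /* example only -- not built */
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_BUCKETS 251u

static uint32_t example_hash(unsigned long n, unsigned int t)
{
    const unsigned char *p = (const unsigned char *)&n;
    uint32_t k = t;
    size_t i;

    for ( i = 0; i < sizeof(n); i++ )
        k = (uint32_t)p[i] + (k << 6) + (k << 16) - k;
    return k % EXAMPLE_BUCKETS;
}

int main(void)
{
    /* The same (frame number, shadow type) pair always hashes to the
     * same bucket, which is all the hash table relies on. */
    printf("bucket = %u\n", example_hash(0x1234UL, 8));
    return 0;
}
#endif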
1288
1289#if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
1290
1291/* Before we get to the mechanism, define a pair of audit functions
1292 * that sanity-check the contents of the hash table. */
1293static void sh_hash_audit_bucket(struct domain *d, int bucket)
1294/* Audit one bucket of the hash table */
1295{
1296    struct shadow_page_info *sp, *x;
1297
1298    if ( !(SHADOW_AUDIT_ENABLE) )
1299        return;
1300
1301    sp = d->arch.paging.shadow.hash_table[bucket];
1302    while ( sp )
1303    {
1304        /* Not a shadow? */
1305        BUG_ON( sp->mbz != 0 );
1306        /* Bogus type? */
1307        BUG_ON( sp->type == 0 ); 
1308        BUG_ON( sp->type > SH_type_max_shadow );
1309        /* Wrong bucket? */
1310        BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket ); 
1311        /* Duplicate entry? */
1312        for ( x = sp->next_shadow; x; x = x->next_shadow )
1313            BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
1314        /* Follow the backpointer to the guest pagetable */
1315        if ( sp->type != SH_type_fl1_32_shadow
1316             && sp->type != SH_type_fl1_pae_shadow
1317             && sp->type != SH_type_fl1_64_shadow )
1318        {
1319            struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
1320            /* Bad shadow flags on guest page? */
1321            BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
1322            /* Bad type count on guest page? */
1323            if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
1324                 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
1325            {
1326                SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
1327                             " but has typecount %#lx\n",
1328                             sp->backpointer, mfn_x(shadow_page_to_mfn(sp)), 
1329                             gpg->u.inuse.type_info);
1330                BUG();
1331            }
1332        }
1333        /* That entry was OK; on we go */
1334        sp = sp->next_shadow;
1335    }
1336}
1337
1338#else
1339#define sh_hash_audit_bucket(_d, _b) do {} while(0)
1340#endif /* Hashtable bucket audit */
1341
1342
1343#if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
1344
1345static void sh_hash_audit(struct domain *d)
1346/* Full audit: audit every bucket in the table */
1347{
1348    int i;
1349
1350    if ( !(SHADOW_AUDIT_ENABLE) )
1351        return;
1352
1353    for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) 
1354    {
1355        sh_hash_audit_bucket(d, i);
1356    }
1357}
1358
1359#else
1360#define sh_hash_audit(_d) do {} while(0)
1361#endif /* Hashtable full audit */
1362
1363/* Allocate and initialise the table itself. 
1364 * Returns 0 for success, 1 for error. */
1365static int shadow_hash_alloc(struct domain *d)
1366{
1367    struct shadow_page_info **table;
1368
1369    ASSERT(shadow_locked_by_me(d));
1370    ASSERT(!d->arch.paging.shadow.hash_table);
1371
1372    table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
1373    if ( !table ) return 1;
1374    memset(table, 0, 
1375           SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
1376    d->arch.paging.shadow.hash_table = table;
1377    return 0;
1378}
1379
1380/* Tear down the hash table and return all memory to Xen.
1381 * This function does not care whether the table is populated. */
1382static void shadow_hash_teardown(struct domain *d)
1383{
1384    ASSERT(shadow_locked_by_me(d));
1385    ASSERT(d->arch.paging.shadow.hash_table);
1386
1387    xfree(d->arch.paging.shadow.hash_table);
1388    d->arch.paging.shadow.hash_table = NULL;
1389}
1390
1391
1392mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
1393/* Find an entry in the hash table.  Returns the MFN of the shadow,
1394 * or INVALID_MFN if it doesn't exist */
1395{
1396    struct domain *d = v->domain;
1397    struct shadow_page_info *sp, *prev;
1398    key_t key;
1399
1400    ASSERT(shadow_locked_by_me(d));
1401    ASSERT(d->arch.paging.shadow.hash_table);
1402    ASSERT(t);
1403
1404    sh_hash_audit(d);
1405
1406    perfc_incr(shadow_hash_lookups);
1407    key = sh_hash(n, t);
1408    sh_hash_audit_bucket(d, key);
1409
1410    sp = d->arch.paging.shadow.hash_table[key];
1411    prev = NULL;
1412    while(sp)
1413    {
1414        if ( sp->backpointer == n && sp->type == t )
1415        {
1416            /* Pull-to-front if 'sp' isn't already the head item */
1417            if ( unlikely(sp != d->arch.paging.shadow.hash_table[key]) )
1418            {
1419                if ( unlikely(d->arch.paging.shadow.hash_walking != 0) )
1420                    /* Can't reorder: someone is walking the hash chains */
1421                    return shadow_page_to_mfn(sp);
1422                else 
1423                {
1424                    ASSERT(prev);
1425                    /* Delete sp from the list */
1426                    prev->next_shadow = sp->next_shadow;                   
1427                    /* Re-insert it at the head of the list */
1428                    sp->next_shadow = d->arch.paging.shadow.hash_table[key];
1429                    d->arch.paging.shadow.hash_table[key] = sp;
1430                }
1431            }
1432            else
1433            {
1434                perfc_incr(shadow_hash_lookup_head);
1435            }
1436            return shadow_page_to_mfn(sp);
1437        }
1438        prev = sp;
1439        sp = sp->next_shadow;
1440    }
1441
1442    perfc_incr(shadow_hash_lookup_miss);
1443    return _mfn(INVALID_MFN);
1444}
1445
1446void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t, 
1447                        mfn_t smfn)
1448/* Put a mapping (n,t)->smfn into the hash table */
1449{
1450    struct domain *d = v->domain;
1451    struct shadow_page_info *sp;
1452    key_t key;
1453   
1454    ASSERT(shadow_locked_by_me(d));
1455    ASSERT(d->arch.paging.shadow.hash_table);
1456    ASSERT(t);
1457
1458    sh_hash_audit(d);
1459
1460    perfc_incr(shadow_hash_inserts);
1461    key = sh_hash(n, t);
1462    sh_hash_audit_bucket(d, key);
1463   
1464    /* Insert this shadow at the top of the bucket */
1465    sp = mfn_to_shadow_page(smfn);
1466    sp->next_shadow = d->arch.paging.shadow.hash_table[key];
1467    d->arch.paging.shadow.hash_table[key] = sp;
1468   
1469    sh_hash_audit_bucket(d, key);
1470}
1471
1472void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t, 
1473                        mfn_t smfn)
1474/* Excise the mapping (n,t)->smfn from the hash table */
1475{
1476    struct domain *d = v->domain;
1477    struct shadow_page_info *sp, *x;
1478    key_t key;
1479
1480    ASSERT(shadow_locked_by_me(d));
1481    ASSERT(d->arch.paging.shadow.hash_table);
1482    ASSERT(t);
1483
1484    sh_hash_audit(d);
1485
1486    perfc_incr(shadow_hash_deletes);
1487    key = sh_hash(n, t);
1488    sh_hash_audit_bucket(d, key);
1489   
1490    sp = mfn_to_shadow_page(smfn);
1491    if ( d->arch.paging.shadow.hash_table[key] == sp ) 
1492        /* Easy case: we're deleting the head item. */
1493        d->arch.paging.shadow.hash_table[key] = sp->next_shadow;
1494    else 
1495    {
1496        /* Need to search for the one we want */
1497        x = d->arch.paging.shadow.hash_table[key];
1498        while ( 1 )
1499        {
1500            ASSERT(x); /* We can't have hit the end, since our target is
1501                        * still in the chain somewhere... */
1502            if ( x->next_shadow == sp ) 
1503            {
1504                x->next_shadow = sp->next_shadow;
1505                break;
1506            }
1507            x = x->next_shadow;
1508        }
1509    }
1510    sp->next_shadow = NULL;
1511
1512    sh_hash_audit_bucket(d, key);
1513}
1514
1515typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
1516
1517static void hash_foreach(struct vcpu *v, 
1518                         unsigned int callback_mask, 
1519                         hash_callback_t callbacks[], 
1520                         mfn_t callback_mfn)
1521/* Walk the hash table looking at the types of the entries and
1522 * calling the appropriate callback function for each entry.
1523 * The mask determines which shadow types we call back for, and the array
1524 * of callbacks tells us which function to call.
1525 * Any callback may return non-zero to let us skip the rest of the scan.
1526 *
1527 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
1528 * then return non-zero to terminate the scan. */
1529{
1530    int i, done = 0;
1531    struct domain *d = v->domain;
1532    struct shadow_page_info *x;
1533
1534    /* Say we're here, to stop hash-lookups reordering the chains */
1535    ASSERT(shadow_locked_by_me(d));
1536    ASSERT(d->arch.paging.shadow.hash_walking == 0);
1537    d->arch.paging.shadow.hash_walking = 1;
1538
1539    for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) 
1540    {
1541        /* WARNING: This is not safe against changes to the hash table.
1542         * The callback *must* return non-zero if it has inserted or
1543         * deleted anything from the hash (lookups are OK, though). */
1544        for ( x = d->arch.paging.shadow.hash_table[i]; x; x = x->next_shadow )
1545        {
1546            if ( callback_mask & (1 << x->type) ) 
1547            {
1548                ASSERT(x->type <= 15);
1549                ASSERT(callbacks[x->type] != NULL);
1550                done = callbacks[x->type](v, shadow_page_to_mfn(x), 
1551                                          callback_mfn);
1552                if ( done ) break;
1553            }
1554        }
1555        if ( done ) break; 
1556    }
1557    d->arch.paging.shadow.hash_walking = 0; 
1558}
1559
1560
1561/**************************************************************************/
1562/* Destroy a shadow page: simple dispatcher to call the per-type destructor
1563 * which will decrement refcounts appropriately and return memory to the
1564 * free pool. */
1565
1566void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
1567{
1568    struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1569    unsigned int t = sp->type;
1570
1571
1572    SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
1573
1574    /* Double-check, if we can, that the shadowed page belongs to this
1575     * domain (by following the back-pointer). */
1576    ASSERT(t == SH_type_fl1_32_shadow  || 
1577           t == SH_type_fl1_pae_shadow || 
1578           t == SH_type_fl1_64_shadow  || 
1579           t == SH_type_monitor_table  || 
1580           (is_pv_32on64_vcpu(v) && t == SH_type_l4_64_shadow) ||
1581           (page_get_owner(mfn_to_page(_mfn(sp->backpointer))) 
1582            == v->domain)); 
1583
1584    /* The SH_type_* values are small consecutive numbers, so the compiler
1585     * can turn the switch statement below into a compact jump table */
1586    switch ( t )
1587    {
1588#if CONFIG_PAGING_LEVELS == 2
1589    case SH_type_l1_32_shadow:
1590    case SH_type_fl1_32_shadow:
1591        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn); 
1592        break;
1593    case SH_type_l2_32_shadow:
1594        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
1595        break;
1596#else /* PAE or 64bit */
1597    case SH_type_l1_32_shadow:
1598    case SH_type_fl1_32_shadow:
1599        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
1600        break;
1601    case SH_type_l2_32_shadow:
1602        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
1603        break;
1604#endif
1605
1606#if CONFIG_PAGING_LEVELS >= 3
1607    case SH_type_l1_pae_shadow:
1608    case SH_type_fl1_pae_shadow:
1609        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
1610        break;
1611    case SH_type_l2_pae_shadow:
1612    case SH_type_l2h_pae_shadow:
1613        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
1614        break;
1615#endif
1616
1617#if CONFIG_PAGING_LEVELS >= 4
1618    case SH_type_l1_64_shadow:
1619    case SH_type_fl1_64_shadow:
1620        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
1621        break;
1622    case SH_type_l2h_64_shadow:
1623        ASSERT(is_pv_32on64_vcpu(v));
1624        /* Fall through... */
1625    case SH_type_l2_64_shadow:
1626        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
1627        break;
1628    case SH_type_l3_64_shadow:
1629        SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
1630        break;
1631    case SH_type_l4_64_shadow:
1632        SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
1633        break;
1634#endif
1635    default:
1636        SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n", 
1637                       (unsigned long)t);
1638        BUG();
1639    }   
1640}
1641
1642/**************************************************************************/
1643/* Remove all writeable mappings of a guest frame from the shadow tables
1644 * Returns non-zero if we need to flush TLBs.
1645 * level and fault_addr describe how we found this to be a pagetable;
1646 * level==0 means we have some other reason for revoking write access. */
1647
1648int sh_remove_write_access(struct vcpu *v, mfn_t gmfn, 
1649                           unsigned int level,
1650                           unsigned long fault_addr)
1651{
1652    /* Dispatch table for getting per-type functions */
1653    static hash_callback_t callbacks[SH_type_unused] = {
1654        NULL, /* none    */
1655#if CONFIG_PAGING_LEVELS == 2
1656        SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,2,2), /* l1_32   */
1657        SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,2,2), /* fl1_32  */
1658#else
1659        SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,2), /* l1_32   */
1660        SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,2), /* fl1_32  */
1661#endif
1662        NULL, /* l2_32   */
1663#if CONFIG_PAGING_LEVELS >= 3
1664        SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,3), /* l1_pae  */
1665        SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,3,3), /* fl1_pae */
1666#else
1667        NULL, /* l1_pae  */
1668        NULL, /* fl1_pae */
1669#endif
1670        NULL, /* l2_pae  */
1671        NULL, /* l2h_pae */
1672#if CONFIG_PAGING_LEVELS >= 4
1673        SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,4,4), /* l1_64   */
1674        SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1,4,4), /* fl1_64  */
1675#else
1676        NULL, /* l1_64   */
1677        NULL, /* fl1_64  */
1678#endif
1679        NULL, /* l2_64   */
1680        NULL, /* l2h_64  */
1681        NULL, /* l3_64   */
1682        NULL, /* l4_64   */
1683        NULL, /* p2m     */
1684        NULL  /* unused  */
1685    };
1686
1687    static unsigned int callback_mask = 
1688          1 << SH_type_l1_32_shadow
1689        | 1 << SH_type_fl1_32_shadow
1690        | 1 << SH_type_l1_pae_shadow
1691        | 1 << SH_type_fl1_pae_shadow
1692        | 1 << SH_type_l1_64_shadow
1693        | 1 << SH_type_fl1_64_shadow
1694        ;
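    /* Only L1 and FL1 shadows contain the leaf PTEs that can map a guest
     * frame writeably, so those are the only shadow types we need to visit. */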
1695    struct page_info *pg = mfn_to_page(gmfn);
1696
1697    ASSERT(shadow_locked_by_me(v->domain));
1698
1699    /* Only remove writable mappings if we are doing shadow refcounts.
1700     * In guest refcounting, we trust Xen to already be restricting
1701     * all the writes to the guest page tables, so we do not need to
1702     * do more. */
1703    if ( !shadow_mode_refcounts(v->domain) )
1704        return 0;
1705
1706    /* Early exit if it's already a pagetable, or otherwise not writeable */
1707    if ( sh_mfn_is_a_page_table(gmfn) 
1708         || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1709        return 0;
1710
1711    perfc_incr(shadow_writeable);
1712
1713    /* If this isn't a "normal" writeable page, the domain is trying to
1714     * put pagetables in special memory of some kind.  We can't allow that. */
1715    if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
1716    {
1717        SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %" 
1718                      PRtype_info "\n",
1719                      mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
1720        domain_crash(v->domain);
1721    }
1722
1723#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
1724    if ( v == current && level != 0 )
1725    {
1726        unsigned long gfn;
1727        /* Heuristic: there is likely to be only one writeable mapping,
1728         * and that mapping is likely to be in the current pagetable,
1729         * in the guest's linear map (on non-HIGHPTE linux and windows)*/
1730
1731#define GUESS(_a, _h) do {                                                \
1732            if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \
1733                perfc_incr(shadow_writeable_h_ ## _h);                   \
1734            if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )          \
1735                return 1;                                                 \
1736        } while (0)
1737
1738       
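        /* The address arithmetic below turns a faulting virtual address into
         * the linear-map address of the PTE that maps it: a net shift of 10
         * bits for 4-byte PTEs (32-bit non-PAE) or 9 bits for 8-byte PTEs
         * (PAE and 64-bit), with 18/27-bit shifts for the L2/L3 entries. */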
1739        if ( v->arch.paging.mode->guest_levels == 2 )
1740        {
1741            if ( level == 1 )
1742                /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
1743                GUESS(0xC0000000UL + (fault_addr >> 10), 1);
1744
1745            /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1746            if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 ) 
1747                GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1748
1749        }
1750#if CONFIG_PAGING_LEVELS >= 3
1751        else if ( v->arch.paging.mode->guest_levels == 3 )
1752        {
1753            /* 32bit PAE w2k3: linear map at 0xC0000000 */
1754            switch ( level ) 
1755            {
1756            case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
1757            case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
1758            }
1759
1760            /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
1761            if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 ) 
1762                GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
1763        }
1764#if CONFIG_PAGING_LEVELS >= 4
1765        else if ( v->arch.paging.mode->guest_levels == 4 )
1766        {
1767            /* 64bit w2k3: linear map at 0xfffff68000000000 */
1768            switch ( level ) 
1769            {
1770            case 1: GUESS(0xfffff68000000000UL 
1771                          + ((fault_addr & VADDR_MASK) >> 9), 3); break;
1772            case 2: GUESS(0xfffff6fb40000000UL
1773                          + ((fault_addr & VADDR_MASK) >> 18), 3); break;
1774            case 3: GUESS(0xfffff6fb7da00000UL 
1775                          + ((fault_addr & VADDR_MASK) >> 27), 3); break;
1776            }
1777
1778            /* 64bit Linux direct map at 0xffff810000000000; older kernels
1779             * had it at 0x0000010000000000UL */
1780            gfn = mfn_to_gfn(v->domain, gmfn); 
1781            GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4); 
1782            GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4); 
1783        }
1784#endif /* CONFIG_PAGING_LEVELS >= 4 */
1785#endif /* CONFIG_PAGING_LEVELS >= 3 */
1786
1787#undef GUESS
1788    }
1789
1790    if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1791        return 1;
1792
1793    /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
1794     * (entries in the fixmap) where linux maps its pagetables.  Since
1795     * we expect to hit them most of the time, we start the search for
1796     * the writeable mapping by looking at the same MFN where the last
1797     * brute-force search succeeded. */
1798
1799    if ( v->arch.paging.shadow.last_writeable_pte_smfn != 0 )
1800    {
1801        unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
1802        mfn_t last_smfn = _mfn(v->arch.paging.shadow.last_writeable_pte_smfn);
1803        int shtype = mfn_to_shadow_page(last_smfn)->type;
1804
1805        if ( callbacks[shtype] ) 
1806            callbacks[shtype](v, last_smfn, gmfn);
1807
1808        if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
1809            perfc_incr(shadow_writeable_h_5);
1810    }
1811
1812    if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
1813        return 1;
1814
1815#endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
1816   
1817    /* Brute-force search of all the shadows, by walking the hash */
1818    perfc_incr(shadow_writeable_bf);
1819    hash_foreach(v, callback_mask, callbacks, gmfn);
1820
1821    /* If that didn't catch the mapping, something is very wrong */
1822    if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
1823    {
1824        SHADOW_ERROR("can't find all writeable mappings of mfn %lx: "
1825                      "%lu left\n", mfn_x(gmfn),
1826                      (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
1827        domain_crash(v->domain);
1828    }
1829   
1830    /* We killed at least one writeable mapping, so must flush TLBs. */
1831    return 1;
1832}
1833
1834
1835
1836/**************************************************************************/
1837/* Remove all mappings of a guest frame from the shadow tables.
1838 * Returns non-zero if we need to flush TLBs. */
1839
1840int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
1841{
1842    struct page_info *page = mfn_to_page(gmfn);
1843    int expected_count, do_locking;
1844
1845    /* Dispatch table for getting per-type functions */
1846    static hash_callback_t callbacks[SH_type_unused] = {
1847        NULL, /* none    */
1848#if CONFIG_PAGING_LEVELS == 2
1849        SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,2,2), /* l1_32   */
1850        SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,2,2), /* fl1_32  */
1851#else
1852        SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,2), /* l1_32   */
1853        SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,2), /* fl1_32  */
1854#endif
1855        NULL, /* l2_32   */
1856#if CONFIG_PAGING_LEVELS >= 3
1857        SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,3), /* l1_pae  */
1858        SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,3,3), /* fl1_pae */
1859#else
1860        NULL, /* l1_pae  */
1861        NULL, /* fl1_pae */
1862#endif
1863        NULL, /* l2_pae  */
1864        NULL, /* l2h_pae */
1865#if CONFIG_PAGING_LEVELS >= 4
1866        SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,4,4), /* l1_64   */
1867        SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1,4,4), /* fl1_64  */
1868#else
1869        NULL, /* l1_64   */
1870        NULL, /* fl1_64  */
1871#endif
1872        NULL, /* l2_64   */
1873        NULL, /* l2h_64  */
1874        NULL, /* l3_64   */
1875        NULL, /* l4_64   */
1876        NULL, /* p2m     */
1877        NULL  /* unused  */
1878    };
1879
1880    static unsigned int callback_mask = 
1881          1 << SH_type_l1_32_shadow
1882        | 1 << SH_type_fl1_32_shadow
1883        | 1 << SH_type_l1_pae_shadow
1884        | 1 << SH_type_fl1_pae_shadow
1885        | 1 << SH_type_l1_64_shadow
1886        | 1 << SH_type_fl1_64_shadow
1887        ;
1888
1889    perfc_incr(shadow_mappings);
1890    if ( (page->count_info & PGC_count_mask) == 0 )
1891        return 0;
1892
1893    /* Although this is an externally visible function, we do not know
1894     * whether the shadow lock will be held when it is called (since it
1895     * can be called via put_page_type when we clear a shadow l1e).
1896     * If the lock isn't held, take it for the duration of the call. */
1897    do_locking = !shadow_locked_by_me(v->domain);
1898    if ( do_locking ) shadow_lock(v->domain);
1899
1900    /* XXX TODO:
1901     * Heuristics for finding the (probably) single mapping of this gmfn */
1902   
1903    /* Brute-force search of all the shadows, by walking the hash */
1904    perfc_incr(shadow_mappings_bf);
1905    hash_foreach(v, callback_mask, callbacks, gmfn);
1906
1907    /* If that didn't catch the mapping, something is very wrong */
1908    expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
1909    if ( (page->count_info & PGC_count_mask) != expected_count )
1910    {
1911        /* Don't complain if we're in HVM and there are some extra mappings:
1912         * The qemu helper process has an untyped mapping of this dom's RAM
1913         * and the HVM restore program takes another. */
1914        if ( !(shadow_mode_external(v->domain)
1915               && (page->count_info & PGC_count_mask) <= 3
1916               && (page->u.inuse.type_info & PGT_count_mask) == 0) )
1917        {
1918            SHADOW_ERROR("can't find all mappings of mfn %lx: "
1919                          "c=%08x t=%08lx\n", mfn_x(gmfn), 
1920                          page->count_info, page->u.inuse.type_info);
1921        }
1922    }
1923
1924    if ( do_locking ) shadow_unlock(v->domain);
1925
1926    /* We killed at least one mapping, so must flush TLBs. */
1927    return 1;
1928}
1929
1930
1931/**************************************************************************/
1932/* Remove all shadows of a guest frame from the shadow tables */
1933
1934static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
1935/* Follow this shadow's up-pointer, if it has one, and remove the reference
1936 * found there.  Returns 1 if that was the only reference to this shadow */
1937{
1938    struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
1939    mfn_t pmfn;
1940    void *vaddr;
1941    int rc;
1942
1943    ASSERT(sp->type > 0);
1944    ASSERT(sp->type < SH_type_max_shadow);
1945    ASSERT(sp->type != SH_type_l2_32_shadow);
1946    ASSERT(sp->type != SH_type_l2_pae_shadow);
1947    ASSERT(sp->type != SH_type_l2h_pae_shadow);
1948    ASSERT(sp->type != SH_type_l4_64_shadow);
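    /* (The types excluded above are top-level shadows, which are kept alive
     * by pinning rather than by a single parent entry, so they have no
     * up-pointer for us to follow.) */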
1949   
1950    if (sp->up == 0) return 0;
1951    pmfn = _mfn(sp->up >> PAGE_SHIFT);
1952    ASSERT(mfn_valid(pmfn));
1953    vaddr = sh_map_domain_page(pmfn);
1954    ASSERT(vaddr);
1955    vaddr += sp->up & (PAGE_SIZE-1);
1956    ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
1957   
1958    /* Is this the only reference to this shadow? */
1959    rc = (sp->count == 1) ? 1 : 0;
1960
1961    /* Blank the offending entry */
1962    switch (sp->type) 
1963    {
1964    case SH_type_l1_32_shadow:
1965    case SH_type_l2_32_shadow:
1966#if CONFIG_PAGING_LEVELS == 2
1967        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
1968#else
1969        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
1970#endif
1971        break;
1972#if CONFIG_PAGING_LEVELS >=3
1973    case SH_type_l1_pae_shadow:
1974    case SH_type_l2_pae_shadow:
1975    case SH_type_l2h_pae_shadow:
1976        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
1977        break;
1978#if CONFIG_PAGING_LEVELS >= 4
1979    case SH_type_l1_64_shadow:
1980    case SH_type_l2_64_shadow:
1981    case SH_type_l2h_64_shadow:
1982    case SH_type_l3_64_shadow:
1983    case SH_type_l4_64_shadow:
1984        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
1985        break;
1986#endif
1987#endif
1988    default: BUG(); /* Some weird unknown shadow type */
1989    }
1990   
1991    sh_unmap_domain_page(vaddr);
1992    if ( rc )
1993        perfc_incr(shadow_up_pointer);
1994    else
1995        perfc_incr(shadow_unshadow_bf);
1996
1997    return rc;
1998}
1999
2000void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
2001/* Remove the shadows of this guest page. 
2002 * If fast != 0, just try the quick heuristic, which will remove
2003 * at most one reference to each shadow of the page.  Otherwise, walk
2004 * all the shadow tables looking for refs to shadows of this gmfn.
2005 * If all != 0, kill the domain if we can't find all the shadows.
2006 * (all != 0 implies fast == 0)
2007 */
2008{
2009    struct page_info *pg = mfn_to_page(gmfn);
2010    mfn_t smfn;
2011    u32 sh_flags;
2012    int do_locking;
2013    unsigned char t;
2014   
2015    /* Dispatch table for getting per-type functions: each level must
2016     * be called with the function to remove a lower-level shadow. */
2017    static hash_callback_t callbacks[SH_type_unused] = {
2018        NULL, /* none    */
2019        NULL, /* l1_32   */
2020        NULL, /* fl1_32  */
2021#if CONFIG_PAGING_LEVELS == 2
2022        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32   */
2023#else
2024        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32   */
2025#endif
2026        NULL, /* l1_pae  */
2027        NULL, /* fl1_pae */
2028#if CONFIG_PAGING_LEVELS >= 3
2029        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae  */
2030        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
2031#else
2032        NULL, /* l2_pae  */
2033        NULL, /* l2h_pae */
2034#endif
2035        NULL, /* l1_64   */
2036        NULL, /* fl1_64  */
2037#if CONFIG_PAGING_LEVELS >= 4
2038        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64   */
2039        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2h_64  */
2040        SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64   */
2041        SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64   */
2042#else
2043        NULL, /* l2_64   */
2044        NULL, /* l2h_64  */
2045        NULL, /* l3_64   */
2046        NULL, /* l4_64   */
2047#endif
2048        NULL, /* p2m     */
2049        NULL  /* unused  */
2050    };
2051
2052    /* Another lookup table, for choosing which mask to use */
2053    static unsigned int masks[SH_type_unused] = {
2054        0, /* none    */
2055        1 << SH_type_l2_32_shadow, /* l1_32   */
2056        0, /* fl1_32  */
2057        0, /* l2_32   */
2058        ((1 << SH_type_l2h_pae_shadow)
2059         | (1 << SH_type_l2_pae_shadow)), /* l1_pae  */
2060        0, /* fl1_pae */
2061        0, /* l2_pae  */
2062        0, /* l2h_pae  */
2063        ((1 << SH_type_l2h_64_shadow)
2064         | (1 << SH_type_l2_64_shadow)),  /* l1_64   */
2065        0, /* fl1_64  */
2066        1 << SH_type_l3_64_shadow, /* l2_64   */
2067        1 << SH_type_l3_64_shadow, /* l2h_64  */
2068        1 << SH_type_l4_64_shadow, /* l3_64   */
2069        0, /* l4_64   */
2070        0, /* p2m     */
2071        0  /* unused  */
2072    };
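    /* masks[t] is the set of shadow types that can hold a reference to a
     * shadow of type t (i.e. its possible parents); hash_foreach() below
     * only needs to visit those types when prising out a lower-level shadow. */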
2073
2074    ASSERT(!(all && fast));
2075
2076    /* Although this is an externally visible function, we do not know
2077     * whether the shadow lock will be held when it is called (since it
2078     * can be called via put_page_type when we clear a shadow l1e).
2079     * If the lock isn't held, take it for the duration of the call. */
2080    do_locking = !shadow_locked_by_me(v->domain);
2081    if ( do_locking ) shadow_lock(v->domain);
2082
2083    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
2084                   v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
2085
2086    /* Bail out now if the page is not shadowed */
2087    if ( (pg->count_info & PGC_page_table) == 0 )
2088    {
2089        if ( do_locking ) shadow_unlock(v->domain);
2090        return;
2091    }
2092
2093    /* Search for this shadow in all appropriate shadows */
2094    perfc_incr(shadow_unshadow);
2095    sh_flags = pg->shadow_flags;
2096
2097    /* Lower-level shadows need to be excised from upper-level shadows.
2098     * This call to hash_foreach() looks dangerous but is in fact OK: each
2099     * call will remove at most one shadow, and terminate immediately when
2100     * it does remove it, so we never walk the hash after doing a deletion.  */
2101#define DO_UNSHADOW(_type) do {                                 \
2102    t = (_type);                                                \
2103    smfn = shadow_hash_lookup(v, mfn_x(gmfn), t);               \
2104    if ( unlikely(!mfn_valid(smfn)) )                           \
2105    {                                                           \
2106        SHADOW_ERROR(": gmfn %#lx has flags 0x%"PRIx32          \
2107                     " but no type-0x%"PRIx32" shadow\n",       \
2108                     mfn_x(gmfn), sh_flags, t);                 \
2109        break;                                                  \
2110    }                                                           \
2111    if ( sh_type_is_pinnable(v, t) )                            \
2112        sh_unpin(v, smfn);                                      \
2113    else                                                        \
2114        sh_remove_shadow_via_pointer(v, smfn);                  \
2115    if ( (pg->count_info & PGC_page_table) && !fast )           \
2116        hash_foreach(v, masks[t], callbacks, smfn);             \
2117} while (0)
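    /* For each shadow type this page has: look the shadow up in the hash,
     * drop its pin (if it is a pinnable top-level shadow) or the single
     * parent reference found via its up-pointer, and then, unless we are in
     * "fast" mode, sweep the possible parent types for any remaining refs. */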
2118
2119    if ( sh_flags & SHF_L1_32 )   DO_UNSHADOW(SH_type_l1_32_shadow);
2120    if ( sh_flags & SHF_L2_32 )   DO_UNSHADOW(SH_type_l2_32_shadow);
2121#if CONFIG_PAGING_LEVELS >= 3
2122    if ( sh_flags & SHF_L1_PAE )  DO_UNSHADOW(SH_type_l1_pae_shadow);
2123    if ( sh_flags & SHF_L2_PAE )  DO_UNSHADOW(SH_type_l2_pae_shadow);
2124    if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(SH_type_l2h_pae_shadow);
2125#if CONFIG_PAGING_LEVELS >= 4
2126    if ( sh_flags & SHF_L1_64 )   DO_UNSHADOW(SH_type_l1_64_shadow);
2127    if ( sh_flags & SHF_L2_64 )   DO_UNSHADOW(SH_type_l2_64_shadow);
2128    if ( sh_flags & SHF_L2H_64 )  DO_UNSHADOW(SH_type_l2h_64_shadow);
2129    if ( sh_flags & SHF_L3_64 )   DO_UNSHADOW(SH_type_l3_64_shadow);
2130    if ( sh_flags & SHF_L4_64 )   DO_UNSHADOW(SH_type_l4_64_shadow);
2131#endif
2132#endif
2133
2134#undef DO_UNSHADOW
2135
2136    /* If that didn't catch the shadows, something is wrong */
2137    if ( !fast && (pg->count_info & PGC_page_table) )
2138    {
2139        SHADOW_ERROR("can't find all shadows of mfn %05lx "
2140                     "(shadow_flags=%08lx)\n",
2141                      mfn_x(gmfn), pg->shadow_flags);
2142        if ( all ) 
2143            domain_crash(v->domain);
2144    }
2145
2146    /* Need to flush TLBs now, so that linear maps are safe next time we
2147     * take a fault. */
2148    flush_tlb_mask(v->domain->domain_dirty_cpumask);
2149
2150    if ( do_locking ) shadow_unlock(v->domain);
2151}
2152
2153static void
2154sh_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
2155/* Even harsher: this is an HVM page that we think is no longer a pagetable.
2156 * Unshadow it, and recursively unshadow pages that reference it. */
2157{
2158    sh_remove_shadows(v, gmfn, 0, 1);
2159    /* XXX TODO:
2160     * Rework this hashtable walker to return a linked-list of all
2161     * the shadows it modified, then do breadth-first recursion
2162     * to find the way up to higher-level tables and unshadow them too.
2163     *
2164     * The current code (just tearing down each page's shadows as we
2165     * detect that it is not a pagetable) is correct, but very slow.
2166     * It means extra emulated writes and slows down removal of mappings. */
2167}
2168
2169/**************************************************************************/
2170
2171static void sh_update_paging_modes(struct vcpu *v)
2172{
2173    struct domain *d = v->domain;
2174    struct paging_mode *old_mode = v->arch.paging.mode;
2175    mfn_t old_guest_table;
2176
2177    ASSERT(shadow_locked_by_me(d));
2178
2179    // Valid transitions handled by this function:
2180    // - For PV guests:
2181    //     - after a shadow mode has been changed
2182    // - For HVM guests:
2183    //     - after a shadow mode has been changed
2184    //     - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
2185    //
2186
2187    // First, tear down any old shadow tables held by this vcpu.
2188    //
2189    if ( v->arch.paging.mode )
2190        v->arch.paging.mode->shadow.detach_old_tables(v);
2191
2192    if ( !is_hvm_domain(d) )
2193    {
2194        ///
2195        /// PV guest
2196        ///
2197#if CONFIG_PAGING_LEVELS == 4
2198        v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
2199#elif CONFIG_PAGING_LEVELS == 3
2200        v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2201#elif CONFIG_PAGING_LEVELS == 2
2202        v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2203#else
2204#error unexpected paging mode
2205#endif
2206        v->arch.paging.translate_enabled = !!shadow_mode_translate(d);
2207    }
2208    else
2209    {
2210        ///
2211        /// HVM guest
2212        ///
2213        ASSERT(shadow_mode_translate(d));
2214        ASSERT(shadow_mode_external(d));
2215
2216        v->arch.paging.translate_enabled = !!hvm_paging_enabled(v);
2217        if ( !v->arch.paging.translate_enabled )
2218        {
2219            /* Set v->arch.guest_table to use the p2m map, and choose
2220             * the appropriate shadow mode */
2221            old_guest_table = pagetable_get_mfn(v->arch.guest_table);
2222#if CONFIG_PAGING_LEVELS == 2
2223            v->arch.guest_table =
2224                pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2225            v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
2226#elif CONFIG_PAGING_LEVELS == 3
2227            v->arch.guest_table =
2228                pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
2229            v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2230#else /* CONFIG_PAGING_LEVELS == 4 */
2231            { 
2232                l4_pgentry_t *l4e; 
2233                /* Use the start of the first l3 table as a PAE l3 */
2234                ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
2235                l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
2236                ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
2237                v->arch.guest_table =
2238                    pagetable_from_pfn(l4e_get_pfn(l4e[0]));
2239                sh_unmap_domain_page(l4e);
2240            }
2241            v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
2242#endif
2243            /* Fix up refcounts on guest_table */
2244            get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
2245            if ( mfn_x(old_guest_table) != 0 )
2246                put_page(mfn_to_page(old_guest_table));
2247        }
2248        else
2249        {
2250#ifdef __x86_64__
2251            if ( hvm_long_mode_enabled(v) )
2252            {
2253                // long mode guest...
2254                v->arch.paging.mode =
2255                    &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
2256            }
2257            else
2258#endif
2259                if ( hvm_pae_enabled(v) )
2260                {
2261#if CONFIG_PAGING_LEVELS >= 3
2262                    // 32-bit PAE mode guest...
2263                    v->arch.paging.mode =
2264                        &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
2265#else
2266                    SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
2267                    domain_crash(d);
2268                    return;
2269#endif
2270                }
2271                else
2272                {
2273                    // 32-bit 2 level guest...
2274#if CONFIG_PAGING_LEVELS >= 3
2275                    v->arch.paging.mode =
2276                        &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
2277#else
2278                    v->arch.paging.mode =
2279                        &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
2280#endif
2281                }
2282        }
2283
2284        if ( pagetable_is_null(v->arch.monitor_table) )
2285        {
2286            mfn_t mmfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2287            v->arch.monitor_table = pagetable_from_mfn(mmfn);
2288            make_cr3(v, mfn_x(mmfn));
2289            hvm_update_host_cr3(v);
2290        }
2291
2292        if ( v->arch.paging.mode != old_mode )
2293        {
2294            SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d g=%u s=%u "
2295                          "(was g=%u s=%u)\n",
2296                          d->domain_id, v->vcpu_id,
2297                          is_hvm_domain(d) ? !!hvm_paging_enabled(v) : 1,
2298                          v->arch.paging.mode->guest_levels,
2299                          v->arch.paging.mode->shadow.shadow_levels,
2300                          old_mode ? old_mode->guest_levels : 0,
2301                          old_mode ? old_mode->shadow.shadow_levels : 0);
2302            if ( old_mode &&
2303                 (v->arch.paging.mode->shadow.shadow_levels !=
2304                  old_mode->shadow.shadow_levels) )
2305            {
2306                /* Need to make a new monitor table for the new mode */
2307                mfn_t new_mfn, old_mfn;
2308
2309                if ( v != current && vcpu_runnable(v) ) 
2310                {
2311                    SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
2312                                 "this HVM vcpu's (d=%u v=%u) paging mode "
2313                                 "while it is running.\n",
2314                                 current->domain->domain_id, current->vcpu_id,
2315                                 v->domain->domain_id, v->vcpu_id);
2316                    /* It's not safe to do that because we can't change
2317                     * the host CR3 for a running domain */
2318                    domain_crash(v->domain);
2319                    return;
2320                }
2321
2322                old_mfn = pagetable_get_mfn(v->arch.monitor_table);
2323                v->arch.monitor_table = pagetable_null();
2324                new_mfn = v->arch.paging.mode->shadow.make_monitor_table(v);
2325                v->arch.monitor_table = pagetable_from_mfn(new_mfn);
2326                SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
2327                               mfn_x(new_mfn));
2328
2329                /* Don't be running on the old monitor table when we
2330                 * pull it down!  Switch CR3, and warn the HVM code that
2331                 * its host cr3 has changed. */
2332                make_cr3(v, mfn_x(new_mfn));
2333                if ( v == current )
2334                    write_ptbase(v);
2335                hvm_update_host_cr3(v);
2336                old_mode->shadow.destroy_monitor_table(v, old_mfn);
2337            }
2338        }
2339
2340        // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
2341        //        These are HARD: think about the case where two CPUs have
2342        //        different values for CR4.PSE and CR4.PGE at the same time.
2343        //        This *does* happen, at least for CR4.PGE...
2344    }
2345
2346    v->arch.paging.mode->update_cr3(v, 0);
2347}
2348
2349void shadow_update_paging_modes(struct vcpu *v)
2350{
2351    shadow_lock(v->domain);
2352    sh_update_paging_modes(v);
2353    shadow_unlock(v->domain);
2354}
2355
2356/**************************************************************************/
2357/* Turning on and off shadow features */
2358
2359static void sh_new_mode(struct domain *d, u32 new_mode)
2360/* Inform all the vcpus that the shadow mode has been changed */
2361{
2362    struct vcpu *v;
2363
2364    ASSERT(shadow_locked_by_me(d));
2365    ASSERT(d != current->domain);
2366    d->arch.paging.mode = new_mode;
2367    for_each_vcpu(d, v)
2368        sh_update_paging_modes(v);
2369}
2370
2371int shadow_enable(struct domain *d, u32 mode)
2372/* Turn on "permanent" shadow features: external, translate, refcount.
2373 * Can only be called once on a domain, and these features cannot be
2374 * disabled.
2375 * Returns 0 for success, -errno for failure. */
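/* (For example, the XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE handler below
 * calls shadow_enable(d, PG_refcounts|PG_translate); PG_SH_enable itself is
 * ORed in here rather than by the caller.) */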
2376{   
2377    unsigned int old_pages;
2378    int rv = 0;
2379
2380    mode |= PG_SH_enable;
2381
2382    domain_pause(d);
2383
2384    /* Sanity check the arguments */
2385    if ( (d == current->domain) ||
2386         shadow_mode_enabled(d) ||
2387         ((mode & PG_translate) && !(mode & PG_refcounts)) ||
2388         ((mode & PG_external) && !(mode & PG_translate)) )
2389    {
2390        rv = -EINVAL;
2391        goto out_unlocked;
2392    }
2393
2394    /* Init the shadow memory allocation if the user hasn't done so */
2395    old_pages = d->arch.paging.shadow.total_pages;
2396    if ( old_pages == 0 )
2397    {
2398        unsigned int r;
2399        shadow_lock(d);               
2400        r = sh_set_allocation(d, 256, NULL); /* Use at least 1MB */
2401        shadow_unlock(d);
2402        if ( r != 0 )
2403        {
2404            sh_set_allocation(d, 0, NULL);
2405            rv = -ENOMEM;
2406            goto out_unlocked;
2407        }       
2408    }
2409
2410    /* Init the P2M table.  Must be done before we take the shadow lock
2411     * to avoid possible deadlock. */
2412    if ( mode & PG_translate )
2413    {
2414        rv = p2m_alloc_table(d, shadow_alloc_p2m_page, shadow_free_p2m_page);
2415        if (rv != 0)
2416            goto out_unlocked;
2417    }
2418
2419    shadow_lock(d);
2420
2421    /* Sanity check again with the lock held */
2422    if ( shadow_mode_enabled(d) )
2423    {
2424        rv = -EINVAL;
2425        goto out_locked;
2426    }
2427
2428    /* Init the hash table */
2429    if ( shadow_hash_alloc(d) != 0 )
2430    {
2431        rv = -ENOMEM;
2432        goto out_locked;
2433    }
2434
2435#if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
2436    /* We assume we're dealing with an older 64bit linux guest until we
2437     * see the guest use more than one l4 per vcpu. */
2438    d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
2439#endif
2440
2441    /* Update the bits */
2442    sh_new_mode(d, mode);
2443
2444 out_locked:
2445    shadow_unlock(d);
2446 out_unlocked:
2447    if ( rv != 0 && !pagetable_is_null(d->arch.phys_table) )
2448        p2m_teardown(d);
2449    domain_unpause(d);
2450    return rv;
2451}
2452
2453void shadow_teardown(struct domain *d)
2454/* Destroy the shadow pagetables of this domain and free its shadow memory.
2455 * Should only be called for dying domains. */
2456{
2457    struct vcpu *v;
2458    mfn_t mfn;
2459    struct list_head *entry, *n;
2460    struct page_info *pg;
2461
2462    ASSERT(d->is_dying);
2463    ASSERT(d != current->domain);
2464
2465    if ( !shadow_locked_by_me(d) )
2466        shadow_lock(d); /* Keep various asserts happy */
2467
2468    if ( shadow_mode_enabled(d) )
2469    {
2470        /* Release the shadow and monitor tables held by each vcpu */
2471        for_each_vcpu(d, v)
2472        {
2473            if ( v->arch.paging.mode )
2474            {
2475                v->arch.paging.mode->shadow.detach_old_tables(v);
2476                if ( shadow_mode_external(d) )
2477                {
2478                    mfn = pagetable_get_mfn(v->arch.monitor_table);
2479                    if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
2480                        v->arch.paging.mode->shadow.destroy_monitor_table(v, mfn);
2481                    v->arch.monitor_table = pagetable_null();
2482                }
2483            }
2484        }
2485    }
2486
2487    list_for_each_safe(entry, n, &d->arch.paging.shadow.p2m_freelist)
2488    {
2489        list_del(entry);
2490        pg = list_entry(entry, struct page_info, list);
2491        shadow_free_p2m_page(d, pg);
2492    }
2493
2494    if ( d->arch.paging.shadow.total_pages != 0 )
2495    {
2496        SHADOW_PRINTK("teardown of domain %u starts."
2497                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
2498                       d->domain_id,
2499                       d->arch.paging.shadow.total_pages, 
2500                       d->arch.paging.shadow.free_pages, 
2501                       d->arch.paging.shadow.p2m_pages);
2502        /* Destroy all the shadows and release memory to domheap */
2503        sh_set_allocation(d, 0, NULL);
2504        /* Release the hash table back to xenheap */
2505        if (d->arch.paging.shadow.hash_table) 
2506            shadow_hash_teardown(d);
2507        /* Release the log-dirty bitmap of dirtied pages */
2508        sh_free_log_dirty_bitmap(d);
2509        /* Should not have any more memory held */
2510        SHADOW_PRINTK("teardown done."
2511                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
2512                       d->arch.paging.shadow.total_pages, 
2513                       d->arch.paging.shadow.free_pages, 
2514                       d->arch.paging.shadow.p2m_pages);
2515        ASSERT(d->arch.paging.shadow.total_pages == 0);
2516    }
2517
2518    /* We leave the "permanent" shadow modes enabled, but clear the
2519     * log-dirty mode bit.  We don't want any more mark_dirty()
2520     * calls now that we've torn down the bitmap */
2521    d->arch.paging.mode &= ~PG_log_dirty;
2522
2523    shadow_unlock(d);
2524}
2525
2526void shadow_final_teardown(struct domain *d)
2527/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
2528{
2529    SHADOW_PRINTK("dom %u final teardown starts."
2530                   "  Shadow pages total = %u, free = %u, p2m=%u\n",
2531                   d->domain_id,
2532                   d->arch.paging.shadow.total_pages, 
2533                   d->arch.paging.shadow.free_pages, 
2534                   d->arch.paging.shadow.p2m_pages);
2535
2536    /* Double-check that the domain didn't have any shadow memory. 
2537     * It is possible for a domain that never got domain_kill()ed
2538     * to get here with its shadow allocation intact. */
2539    if ( d->arch.paging.shadow.total_pages != 0 )
2540        shadow_teardown(d);
2541
2542    /* It is now safe to pull down the p2m map. */
2543    p2m_teardown(d);
2544
2545    SHADOW_PRINTK("dom %u final teardown done."
2546                   "  Shadow pages total = %u, free = %u, p2m=%u\n",
2547                   d->domain_id,
2548                   d->arch.paging.shadow.total_pages, 
2549                   d->arch.paging.shadow.free_pages, 
2550                   d->arch.paging.shadow.p2m_pages);
2551}
2552
2553static int shadow_one_bit_enable(struct domain *d, u32 mode)
2554/* Turn on a single shadow mode feature */
2555{
2556    ASSERT(shadow_locked_by_me(d));
2557
2558    /* Sanity check the call */
2559    if ( d == current->domain || (d->arch.paging.mode & mode) == mode )
2560    {
2561        return -EINVAL;
2562    }
2563
2564    mode |= PG_SH_enable;
2565
2566    if ( d->arch.paging.mode == 0 )
2567    {
2568        /* Init the shadow memory allocation and the hash table */
2569        if ( sh_set_allocation(d, 1, NULL) != 0 
2570             || shadow_hash_alloc(d) != 0 )
2571        {
2572            sh_set_allocation(d, 0, NULL);
2573            return -ENOMEM;
2574        }
2575    }
2576
2577    /* Update the bits */
2578    sh_new_mode(d, d->arch.paging.mode | mode);
2579
2580    return 0;
2581}
2582
2583static int shadow_one_bit_disable(struct domain *d, u32 mode) 
2584/* Turn off a single shadow mode feature */
2585{
2586    struct vcpu *v;
2587    ASSERT(shadow_locked_by_me(d));
2588
2589    /* Sanity check the call */
2590    if ( d == current->domain || !((d->arch.paging.mode & mode) == mode) )
2591    {
2592        return -EINVAL;
2593    }
2594
2595    /* Update the bits */
2596    sh_new_mode(d, d->arch.paging.mode & ~mode);
2597    if ( d->arch.paging.mode == 0 )
2598    {
2599        /* Get this domain off shadows */
2600        SHADOW_PRINTK("un-shadowing of domain %u starts."
2601                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
2602                       d->domain_id,
2603                       d->arch.paging.shadow.total_pages, 
2604                       d->arch.paging.shadow.free_pages, 
2605                       d->arch.paging.shadow.p2m_pages);
2606        for_each_vcpu(d, v)
2607        {
2608            if ( v->arch.paging.mode )
2609                v->arch.paging.mode->shadow.detach_old_tables(v);
2610#if CONFIG_PAGING_LEVELS == 4
2611            if ( !(v->arch.flags & TF_kernel_mode) )
2612                make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
2613            else
2614#endif
2615                make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
2616
2617        }
2618
2619        /* Pull down the memory allocation */
2620        if ( sh_set_allocation(d, 0, NULL) != 0 )
2621        {
2622            // XXX - How can this occur?
2623            //       Seems like a bug to return an error now that we've
2624            //       disabled the relevant shadow mode.
2625            //
2626            return -ENOMEM;
2627        }
2628        shadow_hash_teardown(d);
2629        SHADOW_PRINTK("un-shadowing of domain %u done."
2630                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
2631                       d->domain_id,
2632                       d->arch.paging.shadow.total_pages, 
2633                       d->arch.paging.shadow.free_pages, 
2634                       d->arch.paging.shadow.p2m_pages);
2635    }
2636
2637    return 0;
2638}
2639
2640/* Enable/disable ops for the "test" and "log-dirty" modes */
2641static int shadow_test_enable(struct domain *d)
2642{
2643    int ret;
2644
2645    domain_pause(d);
2646    shadow_lock(d);
2647    ret = shadow_one_bit_enable(d, PG_SH_enable);
2648    shadow_unlock(d);
2649    domain_unpause(d);
2650
2651    return ret;
2652}
2653
2654static int shadow_test_disable(struct domain *d)
2655{
2656    int ret;
2657
2658    domain_pause(d);
2659    shadow_lock(d);
2660    ret = shadow_one_bit_disable(d, PG_SH_enable);
2661    shadow_unlock(d);
2662    domain_unpause(d);
2663
2664    return ret;
2665}
2666
2667static int
2668sh_alloc_log_dirty_bitmap(struct domain *d)
2669{
2670    ASSERT(d->arch.paging.shadow.dirty_bitmap == NULL);
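    /* Size the bitmap to cover gpfns 0..max, rounded up to a whole number
     * of unsigned longs so the xmalloc_array() below gets an exact count. */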
2671    d->arch.paging.shadow.dirty_bitmap_size =
2672        (domain_get_maximum_gpfn(d) + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
2673    d->arch.paging.shadow.dirty_bitmap =
2674        xmalloc_array(unsigned long,
2675                      d->arch.paging.shadow.dirty_bitmap_size / BITS_PER_LONG);
2676    if ( d->arch.paging.shadow.dirty_bitmap == NULL )
2677    {
2678        d->arch.paging.shadow.dirty_bitmap_size = 0;
2679        return -ENOMEM;
2680    }
2681    memset(d->arch.paging.shadow.dirty_bitmap, 0,
2682           d->arch.paging.shadow.dirty_bitmap_size/8);
2683
2684    return 0;
2685}
2686
2687static void
2688sh_free_log_dirty_bitmap(struct domain *d)
2689{
2690    d->arch.paging.shadow.dirty_bitmap_size = 0;
2691    if ( d->arch.paging.shadow.dirty_bitmap )
2692    {
2693        xfree(d->arch.paging.shadow.dirty_bitmap);
2694        d->arch.paging.shadow.dirty_bitmap = NULL;
2695    }
2696}
2697
2698static int shadow_log_dirty_enable(struct domain *d)
2699{
2700    int ret;
2701
2702    domain_pause(d);
2703    shadow_lock(d);
2704
2705    if ( shadow_mode_log_dirty(d) )
2706    {
2707        ret = -EINVAL;
2708        goto out;
2709    }
2710
2711    if ( shadow_mode_enabled(d) )
2712    {
2713        /* This domain already has some shadows: need to clear them out
2714         * of the way to make sure that all references to guest memory are
2715         * properly write-protected */
2716        shadow_blow_tables(d);
2717    }
2718
2719#if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
2720    /* 32bit PV guests on 64bit xen behave like older 64bit linux: they
2721     * change an l4e instead of cr3 to switch tables.  Give them the
2722     * same optimization */
2723    if ( is_pv_32on64_domain(d) )
2724        d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
2725#endif
2726
2727    ret = sh_alloc_log_dirty_bitmap(d);
2728    if ( ret != 0 )
2729    {
2730        sh_free_log_dirty_bitmap(d);
2731        goto out;
2732    }
2733
2734    ret = shadow_one_bit_enable(d, PG_log_dirty);
2735    if ( ret != 0 )
2736        sh_free_log_dirty_bitmap(d);
2737
2738 out:
2739    shadow_unlock(d);
2740    domain_unpause(d);
2741    return ret;
2742}
2743
2744static int shadow_log_dirty_disable(struct domain *d)
2745{
2746    int ret;
2747
2748    domain_pause(d);
2749    shadow_lock(d);
2750    ret = shadow_one_bit_disable(d, PG_log_dirty);
2751    if ( !shadow_mode_log_dirty(d) )
2752        sh_free_log_dirty_bitmap(d);
2753    shadow_unlock(d);
2754    domain_unpause(d);
2755
2756    return ret;
2757}
2758
2759/**************************************************************************/
2760/* P2M map manipulations */
2761
2762/* Shadow-specific code which should be called when a P2M table entry is
2763 * updated with new content.  It is responsible for updating the entry, as
2764 * well as for other shadow processing jobs.
2765 */
2766void
2767shadow_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p, 
2768                       l1_pgentry_t new, unsigned int level)
2769{
2770    struct domain *d = v->domain;
2771    mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
2772    mfn_t mfn;
2773   
2774    shadow_lock(d);
2775
2776    /* handle physmap_add and physmap_remove */
2777    mfn = gfn_to_mfn(d, gfn);
2778    if ( v != NULL && level == 1 && mfn_valid(mfn) ) {
2779        sh_remove_all_shadows_and_parents(v, mfn);
2780        if ( sh_remove_all_mappings(v, mfn) )
2781            flush_tlb_mask(d->domain_dirty_cpumask);   
2782    }
2783   
2784    /* update the entry with new content */
2785    safe_write_pte(p, new);
2786
2787    /* The P2M can be shadowed: keep the shadows synced */
2788    if ( d->vcpu[0] != NULL )
2789        (void)sh_validate_guest_entry(d->vcpu[0], table_mfn, p, sizeof(*p));
2790
2791    /* install P2M in monitors for PAE Xen */
2792#if CONFIG_PAGING_LEVELS == 3
2793    if ( level == 3 ) {
2794        struct vcpu *v;
2795        /* We have written to the p2m l3: need to sync the per-vcpu
2796         * copies of it in the monitor tables */
2797        p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
2798        /* Also, any vcpus running on shadows of the p2m need to
2799         * reload their CR3s so the change propagates to the shadow */
2800        for_each_vcpu(d, v) {
2801            if ( pagetable_get_pfn(v->arch.guest_table) 
2802                 == pagetable_get_pfn(d->arch.phys_table) 
2803                 && v->arch.paging.mode != NULL )
2804                v->arch.paging.mode->update_cr3(v, 0);
2805        }
2806    }
2807#endif
2808
2809#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
2810    /* If we're doing FAST_FAULT_PATH, then shadow mode may have
2811       cached the fact that this is an mmio region in the shadow
2812       page tables.  Blow the tables away to remove the cache.
2813       This is pretty heavy handed, but this is a rare operation
2814       (it might happen a dozen times during boot and then never
2815       again), so it doesn't matter too much. */
2816    if ( d->arch.paging.shadow.has_fast_mmio_entries )
2817    {
2818        shadow_blow_tables(d);
2819        d->arch.paging.shadow.has_fast_mmio_entries = 0;
2820    }
2821#endif
2822
2823    shadow_unlock(d);
2824}
2825
2826/**************************************************************************/
2827/* Log-dirty mode support */
2828
2829/* Convert a shadow to log-dirty mode. */
2830void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
2831{
2832    BUG();
2833}
2834
2835
2836/* Read a domain's log-dirty bitmap and stats. 
2837 * If the operation is a CLEAN, clear the bitmap and stats as well. */
2838static int shadow_log_dirty_op(
2839    struct domain *d, struct xen_domctl_shadow_op *sc)
2840{
2841    int i, rv = 0, clean = 0, peek = 1;
2842
2843    domain_pause(d);
2844    shadow_lock(d);
2845
2846    clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
2847
2848    SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n", 
2849                  (clean) ? "clean" : "peek",
2850                  d->domain_id,
2851                  d->arch.paging.shadow.fault_count, 
2852                  d->arch.paging.shadow.dirty_count);
2853
2854    sc->stats.fault_count = d->arch.paging.shadow.fault_count;
2855    sc->stats.dirty_count = d->arch.paging.shadow.dirty_count;
2856
2857    if ( clean )
2858    {
2859        /* Need to revoke write access to the domain's pages again.
2860         * In future, we'll have a less heavy-handed approach to this,
2861         * but for now, we just unshadow everything except Xen. */
2862        shadow_blow_tables(d);
2863
2864        d->arch.paging.shadow.fault_count = 0;
2865        d->arch.paging.shadow.dirty_count = 0;
2866    }
2867
2868    if ( guest_handle_is_null(sc->dirty_bitmap) )
2869        /* caller may have wanted just to clean the state or access stats. */
2870        peek = 0;
2871
2872    if ( (peek || clean) && (d->arch.paging.shadow.dirty_bitmap == NULL) )
2873    {
2874        rv = -EINVAL; /* perhaps should be ENOMEM? */
2875        goto out;
2876    }
2877 
2878    if ( sc->pages > d->arch.paging.shadow.dirty_bitmap_size )
2879        sc->pages = d->arch.paging.shadow.dirty_bitmap_size;
2880
2881#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
2882    for ( i = 0; i < sc->pages; i += CHUNK )
2883    {
2884        int bytes = ((((sc->pages - i) > CHUNK)
2885                      ? CHUNK
2886                      : (sc->pages - i)) + 7) / 8;
2887
2888        if ( likely(peek) )
2889        {
2890            if ( copy_to_guest_offset(
2891                sc->dirty_bitmap, i/8,
2892                (uint8_t *)d->arch.paging.shadow.dirty_bitmap + (i/8), bytes) )
2893            {
2894                rv = -EFAULT;
2895                goto out;
2896            }
2897        }
2898
2899        if ( clean )
2900            memset((uint8_t *)d->arch.paging.shadow.dirty_bitmap + (i/8), 0, bytes);
2901    }
2902#undef CHUNK
2903
2904 out:
2905    shadow_unlock(d);
2906    domain_unpause(d);
2907    return rv;
2908}
2909
2910
2911/* Mark a page as dirty */
2912void sh_mark_dirty(struct domain *d, mfn_t gmfn)
2913{
2914    unsigned long pfn;
2915    int do_locking;
2916
2917    if ( !shadow_mode_log_dirty(d) || !mfn_valid(gmfn) )
2918        return;
2919
2920    /* Although this is an externally visible function, we do not know
2921     * whether the shadow lock will be held when it is called (since it
2922     * can be called from __hvm_copy during emulation).
2923     * If the lock isn't held, take it for the duration of the call. */
2924    do_locking = !shadow_locked_by_me(d);
2925    if ( do_locking ) 
2926    { 
2927        shadow_lock(d);
2928        /* Check the mode again with the lock held */ 
2929        if ( unlikely(!shadow_mode_log_dirty(d)) )
2930        {
2931            shadow_unlock(d);
2932            return;
2933        }
2934    }
2935
2936    ASSERT(d->arch.paging.shadow.dirty_bitmap != NULL);
2937
2938    /* We /really/ mean PFN here, even for non-translated guests. */
2939    pfn = get_gpfn_from_mfn(mfn_x(gmfn));
2940
2941    /*
2942     * Values with the MSB set denote MFNs that aren't really part of the
2943     * domain's pseudo-physical memory map (e.g., the shared info frame).
2944     * Nothing to do here...
2945     */
2946    if ( unlikely(!VALID_M2P(pfn)) )
2947        return;
2948
2949    /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
2950    if ( likely(pfn < d->arch.paging.shadow.dirty_bitmap_size) ) 
2951    { 
2952        if ( !__test_and_set_bit(pfn, d->arch.paging.shadow.dirty_bitmap) )
2953        {
2954            SHADOW_DEBUG(LOGDIRTY, 
2955                          "marked mfn %" PRI_mfn " (pfn=%lx), dom %d\n",
2956                          mfn_x(gmfn), pfn, d->domain_id);
2957            d->arch.paging.shadow.dirty_count++;
2958        }
2959    }
2960    else
2961    {
2962        SHADOW_PRINTK("mark_dirty OOR! "
2963                       "mfn=%" PRI_mfn " pfn=%lx max=%x (dom %d)\n"
2964                       "owner=%d c=%08x t=%" PRtype_info "\n",
2965                       mfn_x(gmfn), 
2966                       pfn, 
2967                       d->arch.paging.shadow.dirty_bitmap_size,
2968                       d->domain_id,
2969                       (page_get_owner(mfn_to_page(gmfn))
2970                        ? page_get_owner(mfn_to_page(gmfn))->domain_id
2971                        : -1),
2972                       mfn_to_page(gmfn)->count_info, 
2973                       mfn_to_page(gmfn)->u.inuse.type_info);
2974    }
2975
2976    if ( do_locking ) shadow_unlock(d);
2977}
2978
2979/**************************************************************************/
2980/* Shadow-control XEN_DOMCTL dispatcher */
2981
2982int shadow_domctl(struct domain *d, 
2983                  xen_domctl_shadow_op_t *sc,
2984                  XEN_GUEST_HANDLE(void) u_domctl)
2985{
2986    int rc, preempted = 0;
2987
2988    if ( unlikely(d == current->domain) )
2989    {
2990        gdprintk(XENLOG_INFO, "Dom %u tried to do a shadow op on itself.\n",
2991                 d->domain_id);
2992        return -EINVAL;
2993    }
2994
2995    if ( unlikely(d->is_dying) )
2996    {
2997        gdprintk(XENLOG_INFO, "Ignoring shadow op on dying domain %u\n",
2998                 d->domain_id);
2999        return 0;
3000    }
3001
3002    if ( unlikely(d->vcpu[0] == NULL) )
3003    {
3004        SHADOW_ERROR("Shadow op on a domain (%u) with no vcpus\n",
3005                     d->domain_id);
3006        return -EINVAL;
3007    }
3008
    switch ( sc->op )
    {
    case XEN_DOMCTL_SHADOW_OP_OFF:
        if ( shadow_mode_log_dirty(d) )
            if ( (rc = shadow_log_dirty_disable(d)) != 0 )
                return rc;
        if ( d->arch.paging.mode == PG_SH_enable )
            if ( (rc = shadow_test_disable(d)) != 0 )
                return rc;
        return 0;

    case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
        return shadow_test_enable(d);

    case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
        return shadow_log_dirty_enable(d);

    case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
        return shadow_enable(d, PG_refcounts|PG_translate);

    case XEN_DOMCTL_SHADOW_OP_CLEAN:
    case XEN_DOMCTL_SHADOW_OP_PEEK:
        return shadow_log_dirty_op(d, sc);

    case XEN_DOMCTL_SHADOW_OP_ENABLE:
        if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
            return shadow_log_dirty_enable(d);
        return shadow_enable(d, sc->mode << PG_mode_shift);

    case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
        sc->mb = shadow_get_allocation(d);
        return 0;

    case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
        shadow_lock(d);
        if ( sc->mb == 0 && shadow_mode_enabled(d) )
        {
            /* Can't set the allocation to zero unless the domain stops using
             * shadow pagetables first */
            SHADOW_ERROR("Can't set shadow allocation to zero, domain %u"
                         " is still using shadows.\n", d->domain_id);
            shadow_unlock(d);
            return -EINVAL;
        }
        rc = sh_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
        shadow_unlock(d);
        if ( preempted )
            /* Not finished.  Set up to re-run the call. */
            rc = hypercall_create_continuation(
                __HYPERVISOR_domctl, "h", u_domctl);
        else
            /* Finished.  Return the new allocation. */
            sc->mb = shadow_get_allocation(d);
        return rc;

    default:
        SHADOW_ERROR("Bad shadow op %u\n", sc->op);
        return -EINVAL;
    }
}
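
/* Illustrative sketch (not part of the original shadow code, kept out of the
 * build): the unit conversion used by XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION
 * above.  sc->mb is megabytes, while sh_set_allocation() wants pages, hence
 * the shift by (20 - PAGE_SHIFT).  With 4kB pages (PAGE_SHIFT == 12) that is
 * 1 << 8 == 256 pages per MB, so e.g. sc->mb == 16 requests 4096 shadow
 * pages.  The helper names below are invented for this example only. */
#if 0
static inline unsigned long sh_sketch_mb_to_pages(unsigned long mb)
{
    return mb << (20 - PAGE_SHIFT);
}

static inline unsigned long sh_sketch_pages_to_mb(unsigned long pages)
{
    return pages >> (20 - PAGE_SHIFT);
}
#endif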


/**************************************************************************/
/* Auditing shadow tables */

#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL

void shadow_audit_tables(struct vcpu *v)
{
    /* Dispatch table for getting per-type functions */
    static hash_callback_t callbacks[SH_type_unused] = {
        NULL, /* none    */
#if CONFIG_PAGING_LEVELS == 2
        SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2),  /* l1_32   */
        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32  */
        SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2),  /* l2_32   */
#else
        SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2),  /* l1_32   */
        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32  */
        SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2),  /* l2_32   */
        SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3),  /* l1_pae  */
        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
        SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3),  /* l2_pae  */
        SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3),  /* l2h_pae */
#if CONFIG_PAGING_LEVELS >= 4
        SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4),  /* l1_64   */
        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64  */
        SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4),  /* l2_64   */
        SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4),  /* l2h_64  */
        SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4),  /* l3_64   */
        SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4),  /* l4_64   */
#endif /* CONFIG_PAGING_LEVELS >= 4 */
#endif /* CONFIG_PAGING_LEVELS > 2 */
        NULL  /* All the rest */
    };
    unsigned int mask;

    if ( !(SHADOW_AUDIT_ENABLE) )
        return;

    if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
        mask = ~1; /* Audit every table in the system */
    else
    {
        /* Audit only the current mode's tables */
        switch ( v->arch.paging.mode->guest_levels )
        {
        case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
        case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
                        |SHF_L2H_PAE); break;
        case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64
                        |SHF_L3_64|SHF_L4_64); break;
        default: BUG();
        }
    }

    hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
}
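
/* Illustrative sketch (not part of the original shadow code, kept out of the
 * build): how a type-indexed dispatch table like 'callbacks' above pairs
 * with an audit mask.  Each shadow type is both an index into the table and,
 * as (1u << type), a bit in the mask, so one mask value selects which kinds
 * of shadow get audited on a walk of the hash table.  The function name and
 * its arguments are invented for this example; the real walk is done by
 * hash_foreach() with the callbacks[] table declared above. */
#if 0
static int sh_sketch_audit_one(struct vcpu *v, unsigned int shadow_type,
                               mfn_t smfn, unsigned int audit_mask,
                               hash_callback_t *table)
{
    /* Skip types the mask does not select, or that have no audit hook. */
    if ( !(audit_mask & (1u << shadow_type)) || table[shadow_type] == NULL )
        return 0;
    return table[shadow_type](v, smfn, _mfn(INVALID_MFN));
}
#endif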

#endif /* Shadow audit */

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * indent-tabs-mode: nil
 * End:
 */