KVM: Fix x86 emulator writeback
drivers/kvm/kvm_main.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19
20 #include <linux/kvm.h>
21 #include <linux/module.h>
22 #include <linux/errno.h>
23 #include <linux/magic.h>
24 #include <asm/processor.h>
25 #include <linux/percpu.h>
26 #include <linux/gfp.h>
27 #include <asm/msr.h>
28 #include <linux/mm.h>
29 #include <linux/miscdevice.h>
30 #include <linux/vmalloc.h>
31 #include <asm/uaccess.h>
32 #include <linux/reboot.h>
33 #include <asm/io.h>
34 #include <linux/debugfs.h>
35 #include <linux/highmem.h>
36 #include <linux/file.h>
37 #include <asm/desc.h>
38 #include <linux/sysdev.h>
39 #include <linux/cpu.h>
41 #include <linux/fs.h>
42 #include <linux/mount.h>
43 #include <linux/sched.h>
44 #include <linux/cpumask.h>
45 #include <linux/smp.h>
46
47 #include "x86_emulate.h"
48 #include "segment_descriptor.h"
49
50 MODULE_AUTHOR("Qumranet");
51 MODULE_LICENSE("GPL");
52
53 static DEFINE_SPINLOCK(kvm_lock);
54 static LIST_HEAD(vm_list);
55
56 struct kvm_arch_ops *kvm_arch_ops;
57
58 #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
59
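/*
 * Per-vcpu statistics exported via debugfs.  Each entry records the name
 * of a counter and its offset inside struct kvm_vcpu (via STAT_OFFSET);
 * the table is walked when the per-statistic debugfs files are created
 * and torn down.
 */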
60 static struct kvm_stats_debugfs_item {
61         const char *name;
62         int offset;
63         struct dentry *dentry;
64 } debugfs_entries[] = {
65         { "pf_fixed", STAT_OFFSET(pf_fixed) },
66         { "pf_guest", STAT_OFFSET(pf_guest) },
67         { "tlb_flush", STAT_OFFSET(tlb_flush) },
68         { "invlpg", STAT_OFFSET(invlpg) },
69         { "exits", STAT_OFFSET(exits) },
70         { "io_exits", STAT_OFFSET(io_exits) },
71         { "mmio_exits", STAT_OFFSET(mmio_exits) },
72         { "signal_exits", STAT_OFFSET(signal_exits) },
73         { "irq_window", STAT_OFFSET(irq_window_exits) },
74         { "halt_exits", STAT_OFFSET(halt_exits) },
75         { "request_irq", STAT_OFFSET(request_irq_exits) },
76         { "irq_exits", STAT_OFFSET(irq_exits) },
77         { "light_exits", STAT_OFFSET(light_exits) },
78         { "efer_reload", STAT_OFFSET(efer_reload) },
79         { NULL }
80 };
81
82 static struct dentry *debugfs_dir;
83
84 struct vfsmount *kvmfs_mnt;
85
86 #define MAX_IO_MSRS 256
87
88 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
89 #define LMSW_GUEST_MASK 0x0eULL
90 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
91 #define CR8_RESEVED_BITS (~0x0fULL)
92 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
93
94 #ifdef CONFIG_X86_64
95 /* LDT or TSS descriptor in the GDT. 16 bytes. */
96 struct segment_descriptor_64 {
97         struct segment_descriptor s;
98         u32 base_higher;
99         u32 pad_zero;
100 };
101
102 #endif
103
104 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
105                            unsigned long arg);
106
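/*
 * Helpers for the kvmfs pseudo-filesystem.  kvmfs_inode() allocates an
 * anonymous inode on kvmfs_mnt and kvmfs_file() wraps it in a struct file
 * carrying @private_data, so that anonymous file descriptors can be
 * handed back to userspace.
 */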
107 static struct inode *kvmfs_inode(struct file_operations *fops)
108 {
109         int error = -ENOMEM;
110         struct inode *inode = new_inode(kvmfs_mnt->mnt_sb);
111
112         if (!inode)
113                 goto eexit_1;
114
115         inode->i_fop = fops;
116
117         /*
118          * Mark the inode dirty from the very beginning,
119          * that way it will never be moved to the dirty
120          * list because mark_inode_dirty() will think
121          * that it already _is_ on the dirty list.
122          */
123         inode->i_state = I_DIRTY;
124         inode->i_mode = S_IRUSR | S_IWUSR;
125         inode->i_uid = current->fsuid;
126         inode->i_gid = current->fsgid;
127         inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
128         return inode;
129
130 eexit_1:
131         return ERR_PTR(error);
132 }
133
134 static struct file *kvmfs_file(struct inode *inode, void *private_data)
135 {
136         struct file *file = get_empty_filp();
137
138         if (!file)
139                 return ERR_PTR(-ENFILE);
140
141         file->f_path.mnt = mntget(kvmfs_mnt);
142         file->f_path.dentry = d_alloc_anon(inode);
143         if (!file->f_path.dentry)
144                 return ERR_PTR(-ENOMEM);
145         file->f_mapping = inode->i_mapping;
146
147         file->f_pos = 0;
148         file->f_flags = O_RDWR;
149         file->f_op = inode->i_fop;
150         file->f_mode = FMODE_READ | FMODE_WRITE;
151         file->f_version = 0;
152         file->private_data = private_data;
153         return file;
154 }
155
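/*
 * Compute the linear base address of a host segment selector by reading
 * the descriptor straight out of the host GDT (or LDT, for selectors with
 * the TI bit set).  On x86_64 the 64-bit system-descriptor layout is
 * honoured for LDT/TSS selectors.
 */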
156 unsigned long segment_base(u16 selector)
157 {
158         struct descriptor_table gdt;
159         struct segment_descriptor *d;
160         unsigned long table_base;
161         typedef unsigned long ul;
162         unsigned long v;
163
164         if (selector == 0)
165                 return 0;
166
167         asm ("sgdt %0" : "=m"(gdt));
168         table_base = gdt.base;
169
170         if (selector & 4) {           /* from ldt */
171                 u16 ldt_selector;
172
173                 asm ("sldt %0" : "=g"(ldt_selector));
174                 table_base = segment_base(ldt_selector);
175         }
176         d = (struct segment_descriptor *)(table_base + (selector & ~7));
177         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
178 #ifdef CONFIG_X86_64
179         if (d->system == 0
180             && (d->type == 2 || d->type == 9 || d->type == 11))
181                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
182 #endif
183         return v;
184 }
185 EXPORT_SYMBOL_GPL(segment_base);
186
187 static inline int valid_vcpu(int n)
188 {
189         return likely(n >= 0 && n < KVM_MAX_VCPUS);
190 }
191
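/*
 * Copy @size bytes from guest virtual address @addr into @dest, walking
 * the guest page tables one page at a time via gva_to_hpa().  Returns the
 * number of bytes actually copied; the copy stops early at the first
 * unmapped page.
 */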
192 int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
193                    void *dest)
194 {
195         unsigned char *host_buf = dest;
196         unsigned long req_size = size;
197
198         while (size) {
199                 hpa_t paddr;
200                 unsigned now;
201                 unsigned offset;
202                 hva_t guest_buf;
203
204                 paddr = gva_to_hpa(vcpu, addr);
205
206                 if (is_error_hpa(paddr))
207                         break;
208
209                 guest_buf = (hva_t)kmap_atomic(
210                                         pfn_to_page(paddr >> PAGE_SHIFT),
211                                         KM_USER0);
212                 offset = addr & ~PAGE_MASK;
213                 guest_buf |= offset;
214                 now = min(size, PAGE_SIZE - offset);
215                 memcpy(host_buf, (void*)guest_buf, now);
216                 host_buf += now;
217                 addr += now;
218                 size -= now;
219                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
220         }
221         return req_size - size;
222 }
223 EXPORT_SYMBOL_GPL(kvm_read_guest);
224
225 int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
226                     void *data)
227 {
228         unsigned char *host_buf = data;
229         unsigned long req_size = size;
230
231         while (size) {
232                 hpa_t paddr;
233                 unsigned now;
234                 unsigned offset;
235                 hva_t guest_buf;
236                 gfn_t gfn;
237
238                 paddr = gva_to_hpa(vcpu, addr);
239
240                 if (is_error_hpa(paddr))
241                         break;
242
243                 gfn = vcpu->mmu.gva_to_gpa(vcpu, addr) >> PAGE_SHIFT;
244                 mark_page_dirty(vcpu->kvm, gfn);
245                 guest_buf = (hva_t)kmap_atomic(
246                                 pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
247                 offset = addr & ~PAGE_MASK;
248                 guest_buf |= offset;
249                 now = min(size, PAGE_SIZE - offset);
250                 memcpy((void*)guest_buf, host_buf, now);
251                 host_buf += now;
252                 addr += now;
253                 size -= now;
254                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
255         }
256         return req_size - size;
257 }
258 EXPORT_SYMBOL_GPL(kvm_write_guest);
259
260 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
261 {
262         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
263                 return;
264
265         vcpu->guest_fpu_loaded = 1;
266         fx_save(vcpu->host_fx_image);
267         fx_restore(vcpu->guest_fx_image);
268 }
269 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
270
271 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
272 {
273         if (!vcpu->guest_fpu_loaded)
274                 return;
275
276         vcpu->guest_fpu_loaded = 0;
277         fx_save(vcpu->guest_fx_image);
278         fx_restore(vcpu->host_fx_image);
279 }
280 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
281
282 /*
283  * Switches to specified vcpu, until a matching vcpu_put()
284  */
285 static void vcpu_load(struct kvm_vcpu *vcpu)
286 {
287         mutex_lock(&vcpu->mutex);
288         kvm_arch_ops->vcpu_load(vcpu);
289 }
290
291 /*
292  * Switches to specified vcpu, until a matching vcpu_put(). Will return NULL
293  * if the slot is not populated.
294  */
295 static struct kvm_vcpu *vcpu_load_slot(struct kvm *kvm, int slot)
296 {
297         struct kvm_vcpu *vcpu = &kvm->vcpus[slot];
298
299         mutex_lock(&vcpu->mutex);
300         if (!vcpu->vmcs) {
301                 mutex_unlock(&vcpu->mutex);
302                 return NULL;
303         }
304         kvm_arch_ops->vcpu_load(vcpu);
305         return vcpu;
306 }
307
308 static void vcpu_put(struct kvm_vcpu *vcpu)
309 {
310         kvm_arch_ops->vcpu_put(vcpu);
311         mutex_unlock(&vcpu->mutex);
312 }
313
314 static void ack_flush(void *_completed)
315 {
316         atomic_t *completed = _completed;
317
318         atomic_inc(completed);
319 }
320
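/*
 * Ask every vcpu in the VM to flush its TLB: set the KVM_TLB_FLUSH request
 * bit on each vcpu, then IPI the physical cpus currently running a vcpu so
 * they drop out of guest mode, and wait until each IPI has been
 * acknowledged.  The request bit itself is serviced when the vcpu next
 * enters the guest.
 */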
321 void kvm_flush_remote_tlbs(struct kvm *kvm)
322 {
323         int i, cpu, needed;
324         cpumask_t cpus;
325         struct kvm_vcpu *vcpu;
326         atomic_t completed;
327
328         atomic_set(&completed, 0);
329         cpus_clear(cpus);
330         needed = 0;
331         for (i = 0; i < kvm->nvcpus; ++i) {
332                 vcpu = &kvm->vcpus[i];
333                 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
334                         continue;
335                 cpu = vcpu->cpu;
336                 if (cpu != -1 && cpu != raw_smp_processor_id())
337                         if (!cpu_isset(cpu, cpus)) {
338                                 cpu_set(cpu, cpus);
339                                 ++needed;
340                         }
341         }
342
343         /*
344          * We really want smp_call_function_mask() here.  But that's not
345  * available, so IPI all cpus in parallel and wait for them
346          * to complete.
347          */
348         for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus))
349                 smp_call_function_single(cpu, ack_flush, &completed, 1, 0);
350         while (atomic_read(&completed) != needed) {
351                 cpu_relax();
352                 barrier();
353         }
354 }
355
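/*
 * Allocate and minimally initialize a struct kvm: the pio and mmio buses,
 * the mmu page list and the per-vcpu mutexes.  The new VM is also linked
 * into the global vm_list under kvm_lock.
 */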
356 static struct kvm *kvm_create_vm(void)
357 {
358         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
359         int i;
360
361         if (!kvm)
362                 return ERR_PTR(-ENOMEM);
363
364         kvm_io_bus_init(&kvm->pio_bus);
365         spin_lock_init(&kvm->lock);
366         INIT_LIST_HEAD(&kvm->active_mmu_pages);
367         spin_lock(&kvm_lock);
368         list_add(&kvm->vm_list, &vm_list);
369         spin_unlock(&kvm_lock);
370         kvm_io_bus_init(&kvm->mmio_bus);
371         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
372                 struct kvm_vcpu *vcpu = &kvm->vcpus[i];
373
374                 mutex_init(&vcpu->mutex);
375                 vcpu->cpu = -1;
376                 vcpu->kvm = kvm;
377                 vcpu->mmu.root_hpa = INVALID_PAGE;
378         }
379         return kvm;
380 }
381
382 static int kvm_dev_open(struct inode *inode, struct file *filp)
383 {
384         return 0;
385 }
386
387 /*
388  * Free any memory in @free but not in @dont.
389  */
390 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
391                                   struct kvm_memory_slot *dont)
392 {
393         int i;
394
395         if (!dont || free->phys_mem != dont->phys_mem)
396                 if (free->phys_mem) {
397                         for (i = 0; i < free->npages; ++i)
398                                 if (free->phys_mem[i])
399                                         __free_page(free->phys_mem[i]);
400                         vfree(free->phys_mem);
401                 }
402
403         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
404                 vfree(free->dirty_bitmap);
405
406         free->phys_mem = NULL;
407         free->npages = 0;
408         free->dirty_bitmap = NULL;
409 }
410
411 static void kvm_free_physmem(struct kvm *kvm)
412 {
413         int i;
414
415         for (i = 0; i < kvm->nmemslots; ++i)
416                 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
417 }
418
419 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
420 {
421         int i;
422
423         for (i = 0; i < 2; ++i)
424                 if (vcpu->pio.guest_pages[i]) {
425                         __free_page(vcpu->pio.guest_pages[i]);
426                         vcpu->pio.guest_pages[i] = NULL;
427                 }
428 }
429
430 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
431 {
432         if (!vcpu->vmcs)
433                 return;
434
435         vcpu_load(vcpu);
436         kvm_mmu_unload(vcpu);
437         vcpu_put(vcpu);
438 }
439
440 static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
441 {
442         if (!vcpu->vmcs)
443                 return;
444
445         vcpu_load(vcpu);
446         kvm_mmu_destroy(vcpu);
447         vcpu_put(vcpu);
448         kvm_arch_ops->vcpu_free(vcpu);
449         free_page((unsigned long)vcpu->run);
450         vcpu->run = NULL;
451         free_page((unsigned long)vcpu->pio_data);
452         vcpu->pio_data = NULL;
453         free_pio_guest_pages(vcpu);
454 }
455
456 static void kvm_free_vcpus(struct kvm *kvm)
457 {
458         unsigned int i;
459
460         /*
461          * Unpin any mmu pages first.
462          */
463         for (i = 0; i < KVM_MAX_VCPUS; ++i)
464                 kvm_unload_vcpu_mmu(&kvm->vcpus[i]);
465         for (i = 0; i < KVM_MAX_VCPUS; ++i)
466                 kvm_free_vcpu(&kvm->vcpus[i]);
467 }
468
469 static int kvm_dev_release(struct inode *inode, struct file *filp)
470 {
471         return 0;
472 }
473
474 static void kvm_destroy_vm(struct kvm *kvm)
475 {
476         spin_lock(&kvm_lock);
477         list_del(&kvm->vm_list);
478         spin_unlock(&kvm_lock);
479         kvm_io_bus_destroy(&kvm->pio_bus);
480         kvm_io_bus_destroy(&kvm->mmio_bus);
481         kvm_free_vcpus(kvm);
482         kvm_free_physmem(kvm);
483         kfree(kvm);
484 }
485
486 static int kvm_vm_release(struct inode *inode, struct file *filp)
487 {
488         struct kvm *kvm = filp->private_data;
489
490         kvm_destroy_vm(kvm);
491         return 0;
492 }
493
494 static void inject_gp(struct kvm_vcpu *vcpu)
495 {
496         kvm_arch_ops->inject_gp(vcpu, 0);
497 }
498
499 /*
500  * Load the PAE pdptrs.  Return true if they are all valid.
501  */
502 static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
503 {
504         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
505         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
506         int i;
507         u64 pdpte;
508         u64 *pdpt;
509         int ret;
510         struct page *page;
511
512         spin_lock(&vcpu->kvm->lock);
513         page = gfn_to_page(vcpu->kvm, pdpt_gfn);
514         /* FIXME: !page - emulate? 0xff? */
515         pdpt = kmap_atomic(page, KM_USER0);
516
517         ret = 1;
518         for (i = 0; i < 4; ++i) {
519                 pdpte = pdpt[offset + i];
520                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
521                         ret = 0;
522                         goto out;
523                 }
524         }
525
526         for (i = 0; i < 4; ++i)
527                 vcpu->pdptrs[i] = pdpt[offset + i];
528
529 out:
530         kunmap_atomic(pdpt, KM_USER0);
531         spin_unlock(&vcpu->kvm->lock);
532
533         return ret;
534 }
535
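/*
 * Emulate a guest write to cr0.  Reserved bits and illegal bit
 * combinations inject #GP instead of being written; turning paging on is
 * cross-checked against long mode (EFER.LME, CS.L) and, for PAE, the
 * pdptrs are reloaded.  On success the value is pushed down to the arch
 * code and the shadow MMU context is reset.
 */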
536 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
537 {
538         if (cr0 & CR0_RESEVED_BITS) {
539                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
540                        cr0, vcpu->cr0);
541                 inject_gp(vcpu);
542                 return;
543         }
544
545         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
546                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
547                 inject_gp(vcpu);
548                 return;
549         }
550
551         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
552                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
553                        "and a clear PE flag\n");
554                 inject_gp(vcpu);
555                 return;
556         }
557
558         if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
559 #ifdef CONFIG_X86_64
560                 if ((vcpu->shadow_efer & EFER_LME)) {
561                         int cs_db, cs_l;
562
563                         if (!is_pae(vcpu)) {
564                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
565                                        "in long mode while PAE is disabled\n");
566                                 inject_gp(vcpu);
567                                 return;
568                         }
569                         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
570                         if (cs_l) {
571                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
572                                        "in long mode while CS.L == 1\n");
573                                 inject_gp(vcpu);
574                                 return;
575
576                         }
577                 } else
578 #endif
579                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
580                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
581                                "reserved bits\n");
582                         inject_gp(vcpu);
583                         return;
584                 }
585
586         }
587
588         kvm_arch_ops->set_cr0(vcpu, cr0);
589         vcpu->cr0 = cr0;
590
591         spin_lock(&vcpu->kvm->lock);
592         kvm_mmu_reset_context(vcpu);
593         spin_unlock(&vcpu->kvm->lock);
595 }
596 EXPORT_SYMBOL_GPL(set_cr0);
597
598 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
599 {
600         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
601 }
602 EXPORT_SYMBOL_GPL(lmsw);
603
604 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
605 {
606         if (cr4 & CR4_RESEVED_BITS) {
607                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
608                 inject_gp(vcpu);
609                 return;
610         }
611
612         if (is_long_mode(vcpu)) {
613                 if (!(cr4 & CR4_PAE_MASK)) {
614                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
615                                "in long mode\n");
616                         inject_gp(vcpu);
617                         return;
618                 }
619         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
620                    && !load_pdptrs(vcpu, vcpu->cr3)) {
621                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
622                 inject_gp(vcpu);
623         }
624
625         if (cr4 & CR4_VMXE_MASK) {
626                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
627                 inject_gp(vcpu);
628                 return;
629         }
630         kvm_arch_ops->set_cr4(vcpu, cr4);
631         spin_lock(&vcpu->kvm->lock);
632         kvm_mmu_reset_context(vcpu);
633         spin_unlock(&vcpu->kvm->lock);
634 }
635 EXPORT_SYMBOL_GPL(set_cr4);
636
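/*
 * Emulate a guest write to cr3: validate the reserved bits for the current
 * mode (reloading the pdptrs for PAE paging), then point the shadow MMU at
 * the new root.  A cr3 that does not map to a memory slot injects #GP.
 */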
637 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
638 {
639         if (is_long_mode(vcpu)) {
640                 if (cr3 & CR3_L_MODE_RESEVED_BITS) {
641                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
642                         inject_gp(vcpu);
643                         return;
644                 }
645         } else {
646                 if (cr3 & CR3_RESEVED_BITS) {
647                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
648                         inject_gp(vcpu);
649                         return;
650                 }
651                 if (is_paging(vcpu) && is_pae(vcpu) &&
652                     !load_pdptrs(vcpu, cr3)) {
653                         printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
654                                "reserved bits\n");
655                         inject_gp(vcpu);
656                         return;
657                 }
658         }
659
660         vcpu->cr3 = cr3;
661         spin_lock(&vcpu->kvm->lock);
662         /*
663          * Does the new cr3 value map to physical memory? (Note, we
664          * catch an invalid cr3 even in real-mode, because it would
665          * cause trouble later on when we turn on paging anyway.)
666          *
667          * A real CPU would silently accept an invalid cr3 and would
668          * attempt to use it - with largely undefined (and often hard
669          * to debug) behavior on the guest side.
670          */
671         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
672                 inject_gp(vcpu);
673         else
674                 vcpu->mmu.new_cr3(vcpu);
675         spin_unlock(&vcpu->kvm->lock);
676 }
677 EXPORT_SYMBOL_GPL(set_cr3);
678
679 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
680 {
681         if (cr8 & CR8_RESEVED_BITS) {
682                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
683                 inject_gp(vcpu);
684                 return;
685         }
686         vcpu->cr8 = cr8;
687 }
688 EXPORT_SYMBOL_GPL(set_cr8);
689
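/*
 * Initialize the guest FPU image: save the host FPU state, run fpu_init()
 * to obtain a freshly reset state and capture it as the guest image, then
 * restore the host state.  Finally give the guest a sane mxcsr and clear
 * the remainder of the save area.
 */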
690 void fx_init(struct kvm_vcpu *vcpu)
691 {
692         struct __attribute__ ((__packed__)) fx_image_s {
693                 u16 control; /* fcw */
694                 u16 status; /* fsw */
695                 u16 tag; /* ftw */
696                 u16 opcode; /* fop */
697                 u64 ip; /* fpu ip */
698                 u64 operand; /* fpu dp */
699                 u32 mxcsr;
700                 u32 mxcsr_mask;
701
702         } *fx_image;
703
704         fx_save(vcpu->host_fx_image);
705         fpu_init();
706         fx_save(vcpu->guest_fx_image);
707         fx_restore(vcpu->host_fx_image);
708
709         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
710         fx_image->mxcsr = 0x1f80;
711         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
712                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
713 }
714 EXPORT_SYMBOL_GPL(fx_init);
715
716 static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
717 {
718         spin_lock(&vcpu->kvm->lock);
719         kvm_mmu_slot_remove_write_access(vcpu, slot);
720         spin_unlock(&vcpu->kvm->lock);
721 }
722
723 /*
724  * Allocate some memory and give it an address in the guest physical address
725  * space.
726  *
727  * Discontiguous memory is allowed, mostly for framebuffers.
728  */
729 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
730                                           struct kvm_memory_region *mem)
731 {
732         int r;
733         gfn_t base_gfn;
734         unsigned long npages;
735         unsigned long i;
736         struct kvm_memory_slot *memslot;
737         struct kvm_memory_slot old, new;
738         int memory_config_version;
739
740         r = -EINVAL;
741         /* General sanity checks */
742         if (mem->memory_size & (PAGE_SIZE - 1))
743                 goto out;
744         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
745                 goto out;
746         if (mem->slot >= KVM_MEMORY_SLOTS)
747                 goto out;
748         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
749                 goto out;
750
751         memslot = &kvm->memslots[mem->slot];
752         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
753         npages = mem->memory_size >> PAGE_SHIFT;
754
755         if (!npages)
756                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
757
758 raced:
759         spin_lock(&kvm->lock);
760
761         memory_config_version = kvm->memory_config_version;
762         new = old = *memslot;
763
764         new.base_gfn = base_gfn;
765         new.npages = npages;
766         new.flags = mem->flags;
767
768         /* Disallow changing a memory slot's size. */
769         r = -EINVAL;
770         if (npages && old.npages && npages != old.npages)
771                 goto out_unlock;
772
773         /* Check for overlaps */
774         r = -EEXIST;
775         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
776                 struct kvm_memory_slot *s = &kvm->memslots[i];
777
778                 if (s == memslot)
779                         continue;
780                 if (!((base_gfn + npages <= s->base_gfn) ||
781                       (base_gfn >= s->base_gfn + s->npages)))
782                         goto out_unlock;
783         }
784         /*
785          * Do memory allocations outside the lock.  memory_config_version will
786          * detect any races.
787          */
788         spin_unlock(&kvm->lock);
789
790         /* Deallocate if slot is being removed */
791         if (!npages)
792                 new.phys_mem = NULL;
793
794         /* Free page dirty bitmap if unneeded */
795         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
796                 new.dirty_bitmap = NULL;
797
798         r = -ENOMEM;
799
800         /* Allocate if a slot is being created */
801         if (npages && !new.phys_mem) {
802                 new.phys_mem = vmalloc(npages * sizeof(struct page *));
803
804                 if (!new.phys_mem)
805                         goto out_free;
806
807                 memset(new.phys_mem, 0, npages * sizeof(struct page *));
808                 for (i = 0; i < npages; ++i) {
809                         new.phys_mem[i] = alloc_page(GFP_HIGHUSER
810                                                      | __GFP_ZERO);
811                         if (!new.phys_mem[i])
812                                 goto out_free;
813                         set_page_private(new.phys_mem[i], 0);
814                 }
815         }
816
817         /* Allocate page dirty bitmap if needed */
818         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
819                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
820
821                 new.dirty_bitmap = vmalloc(dirty_bytes);
822                 if (!new.dirty_bitmap)
823                         goto out_free;
824                 memset(new.dirty_bitmap, 0, dirty_bytes);
825         }
826
827         spin_lock(&kvm->lock);
828
829         if (memory_config_version != kvm->memory_config_version) {
830                 spin_unlock(&kvm->lock);
831                 kvm_free_physmem_slot(&new, &old);
832                 goto raced;
833         }
834
835         r = -EAGAIN;
836         if (kvm->busy)
837                 goto out_unlock;
838
839         if (mem->slot >= kvm->nmemslots)
840                 kvm->nmemslots = mem->slot + 1;
841
842         *memslot = new;
843         ++kvm->memory_config_version;
844
845         spin_unlock(&kvm->lock);
846
847         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
848                 struct kvm_vcpu *vcpu;
849
850                 vcpu = vcpu_load_slot(kvm, i);
851                 if (!vcpu)
852                         continue;
853                 if (new.flags & KVM_MEM_LOG_DIRTY_PAGES)
854                         do_remove_write_access(vcpu, mem->slot);
855                 kvm_mmu_reset_context(vcpu);
856                 vcpu_put(vcpu);
857         }
858
859         kvm_free_physmem_slot(&old, &new);
860         return 0;
861
862 out_unlock:
863         spin_unlock(&kvm->lock);
864 out_free:
865         kvm_free_physmem_slot(&new, &old);
866 out:
867         return r;
868 }
869
870 /*
871  * Get (and clear) the dirty memory log for a memory slot.
872  */
873 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
874                                       struct kvm_dirty_log *log)
875 {
876         struct kvm_memory_slot *memslot;
877         int r, i;
878         int n;
879         int cleared;
880         unsigned long any = 0;
881
882         spin_lock(&kvm->lock);
883
884         /*
885          * Prevent changes to guest memory configuration even while the lock
886          * is not taken.
887          */
888         ++kvm->busy;
889         spin_unlock(&kvm->lock);
890         r = -EINVAL;
891         if (log->slot >= KVM_MEMORY_SLOTS)
892                 goto out;
893
894         memslot = &kvm->memslots[log->slot];
895         r = -ENOENT;
896         if (!memslot->dirty_bitmap)
897                 goto out;
898
899         n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
900
901         for (i = 0; !any && i < n/sizeof(long); ++i)
902                 any = memslot->dirty_bitmap[i];
903
904         r = -EFAULT;
905         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
906                 goto out;
907
908         if (any) {
909                 cleared = 0;
910                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
911                         struct kvm_vcpu *vcpu;
912
913                         vcpu = vcpu_load_slot(kvm, i);
914                         if (!vcpu)
915                                 continue;
916                         if (!cleared) {
917                                 do_remove_write_access(vcpu, log->slot);
918                                 memset(memslot->dirty_bitmap, 0, n);
919                                 cleared = 1;
920                         }
921                         kvm_arch_ops->tlb_flush(vcpu);
922                         vcpu_put(vcpu);
923                 }
924         }
925
926         r = 0;
927
928 out:
929         spin_lock(&kvm->lock);
930         --kvm->busy;
931         spin_unlock(&kvm->lock);
932         return r;
933 }
934
935 /*
936  * Set a new alias region.  Aliases map a portion of physical memory into
937  * another portion.  This is useful for memory windows, for example the PC
938  * VGA region.
939  */
940 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
941                                          struct kvm_memory_alias *alias)
942 {
943         int r, n;
944         struct kvm_mem_alias *p;
945
946         r = -EINVAL;
947         /* General sanity checks */
948         if (alias->memory_size & (PAGE_SIZE - 1))
949                 goto out;
950         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
951                 goto out;
952         if (alias->slot >= KVM_ALIAS_SLOTS)
953                 goto out;
954         if (alias->guest_phys_addr + alias->memory_size
955             < alias->guest_phys_addr)
956                 goto out;
957         if (alias->target_phys_addr + alias->memory_size
958             < alias->target_phys_addr)
959                 goto out;
960
961         spin_lock(&kvm->lock);
962
963         p = &kvm->aliases[alias->slot];
964         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
965         p->npages = alias->memory_size >> PAGE_SHIFT;
966         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
967
968         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
969                 if (kvm->aliases[n - 1].npages)
970                         break;
971         kvm->naliases = n;
972
973         spin_unlock(&kvm->lock);
974
975         vcpu_load(&kvm->vcpus[0]);
976         spin_lock(&kvm->lock);
977         kvm_mmu_zap_all(&kvm->vcpus[0]);
978         spin_unlock(&kvm->lock);
979         vcpu_put(&kvm->vcpus[0]);
980
981         return 0;
982
983 out:
984         return r;
985 }
986
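/*
 * Translate a guest frame number through the alias table; gfns that fall
 * outside every alias region are returned unchanged.
 */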
987 static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
988 {
989         int i;
990         struct kvm_mem_alias *alias;
991
992         for (i = 0; i < kvm->naliases; ++i) {
993                 alias = &kvm->aliases[i];
994                 if (gfn >= alias->base_gfn
995                     && gfn < alias->base_gfn + alias->npages)
996                         return alias->target_gfn + gfn - alias->base_gfn;
997         }
998         return gfn;
999 }
1000
1001 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1002 {
1003         int i;
1004
1005         for (i = 0; i < kvm->nmemslots; ++i) {
1006                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
1007
1008                 if (gfn >= memslot->base_gfn
1009                     && gfn < memslot->base_gfn + memslot->npages)
1010                         return memslot;
1011         }
1012         return NULL;
1013 }
1014
1015 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1016 {
1017         gfn = unalias_gfn(kvm, gfn);
1018         return __gfn_to_memslot(kvm, gfn);
1019 }
1020
1021 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1022 {
1023         struct kvm_memory_slot *slot;
1024
1025         gfn = unalias_gfn(kvm, gfn);
1026         slot = __gfn_to_memslot(kvm, gfn);
1027         if (!slot)
1028                 return NULL;
1029         return slot->phys_mem[gfn - slot->base_gfn];
1030 }
1031 EXPORT_SYMBOL_GPL(gfn_to_page);
1032
1033 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1034 {
1035         int i;
1036         struct kvm_memory_slot *memslot;
1037         unsigned long rel_gfn;
1038
1039         for (i = 0; i < kvm->nmemslots; ++i) {
1040                 memslot = &kvm->memslots[i];
1041
1042                 if (gfn >= memslot->base_gfn
1043                     && gfn < memslot->base_gfn + memslot->npages) {
1044
1045                         if (!memslot->dirty_bitmap)
1046                                 return;
1047
1048                         rel_gfn = gfn - memslot->base_gfn;
1049
1050                         /* avoid RMW */
1051                         if (!test_bit(rel_gfn, memslot->dirty_bitmap))
1052                                 set_bit(rel_gfn, memslot->dirty_bitmap);
1053                         return;
1054                 }
1055         }
1056 }
1057
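/*
 * x86_emulate callback: read guest-virtual memory on behalf of the
 * instruction emulator, translating through the guest MMU one page at a
 * time.  A translation failure is propagated as a fault to the emulator.
 */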
1058 static int emulator_read_std(unsigned long addr,
1059                              void *val,
1060                              unsigned int bytes,
1061                              struct x86_emulate_ctxt *ctxt)
1062 {
1063         struct kvm_vcpu *vcpu = ctxt->vcpu;
1064         void *data = val;
1065
1066         while (bytes) {
1067                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1068                 unsigned offset = addr & (PAGE_SIZE-1);
1069                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1070                 unsigned long pfn;
1071                 struct page *page;
1072                 void *page_virt;
1073
1074                 if (gpa == UNMAPPED_GVA)
1075                         return X86EMUL_PROPAGATE_FAULT;
1076                 pfn = gpa >> PAGE_SHIFT;
1077                 page = gfn_to_page(vcpu->kvm, pfn);
1078                 if (!page)
1079                         return X86EMUL_UNHANDLEABLE;
1080                 page_virt = kmap_atomic(page, KM_USER0);
1081
1082                 memcpy(data, page_virt + offset, tocopy);
1083
1084                 kunmap_atomic(page_virt, KM_USER0);
1085
1086                 bytes -= tocopy;
1087                 data += tocopy;
1088                 addr += tocopy;
1089         }
1090
1091         return X86EMUL_CONTINUE;
1092 }
1093
1094 static int emulator_write_std(unsigned long addr,
1095                               const void *val,
1096                               unsigned int bytes,
1097                               struct x86_emulate_ctxt *ctxt)
1098 {
1099         printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
1100                addr, bytes);
1101         return X86EMUL_UNHANDLEABLE;
1102 }
1103
1104 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1105                                                 gpa_t addr)
1106 {
1107         /*
1108          * Note that it's important to have this wrapper function because
1109          * in the very near future we will be checking for MMIOs against
1110          * the LAPIC as well as the general MMIO bus
1111          */
1112         return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1113 }
1114
1115 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1116                                                gpa_t addr)
1117 {
1118         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1119 }
1120
1121 static int emulator_read_emulated(unsigned long addr,
1122                                   void *val,
1123                                   unsigned int bytes,
1124                                   struct x86_emulate_ctxt *ctxt)
1125 {
1126         struct kvm_vcpu      *vcpu = ctxt->vcpu;
1127         struct kvm_io_device *mmio_dev;
1128         gpa_t                 gpa;
1129
1130         if (vcpu->mmio_read_completed) {
1131                 memcpy(val, vcpu->mmio_data, bytes);
1132                 vcpu->mmio_read_completed = 0;
1133                 return X86EMUL_CONTINUE;
1134         } else if (emulator_read_std(addr, val, bytes, ctxt)
1135                    == X86EMUL_CONTINUE)
1136                 return X86EMUL_CONTINUE;
1137
1138         gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1139         if (gpa == UNMAPPED_GVA)
1140                 return X86EMUL_PROPAGATE_FAULT;
1141
1142         /*
1143          * Is this MMIO handled locally?
1144          */
1145         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1146         if (mmio_dev) {
1147                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1148                 return X86EMUL_CONTINUE;
1149         }
1150
1151         vcpu->mmio_needed = 1;
1152         vcpu->mmio_phys_addr = gpa;
1153         vcpu->mmio_size = bytes;
1154         vcpu->mmio_is_write = 0;
1155
1156         return X86EMUL_UNHANDLEABLE;
1157 }
1158
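/*
 * Write @bytes at guest physical address @gpa when it is backed by a
 * memory slot: mark the page dirty and let kvm_mmu_pte_write() keep the
 * shadow page tables coherent before copying the data in.  Returns 0 if
 * the access crosses a page boundary or does not hit RAM, in which case
 * the caller falls back to MMIO handling.
 */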
1159 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1160                                const void *val, int bytes)
1161 {
1162         struct page *page;
1163         void *virt;
1164         unsigned offset = offset_in_page(gpa);
1165
1166         if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
1167                 return 0;
1168         page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1169         if (!page)
1170                 return 0;
1171         mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1172         virt = kmap_atomic(page, KM_USER0);
1173         kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes);
1174         memcpy(virt + offset_in_page(gpa), val, bytes);
1175         kunmap_atomic(virt, KM_USER0);
1176         return 1;
1177 }
1178
1179 static int emulator_write_emulated(unsigned long addr,
1180                                    const void *val,
1181                                    unsigned int bytes,
1182                                    struct x86_emulate_ctxt *ctxt)
1183 {
1184         struct kvm_vcpu      *vcpu = ctxt->vcpu;
1185         struct kvm_io_device *mmio_dev;
1186         gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1187
1188         if (gpa == UNMAPPED_GVA) {
1189                 kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
1190                 return X86EMUL_PROPAGATE_FAULT;
1191         }
1192
1193         if (emulator_write_phys(vcpu, gpa, val, bytes))
1194                 return X86EMUL_CONTINUE;
1195
1196         /*
1197          * Is this MMIO handled locally?
1198          */
1199         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1200         if (mmio_dev) {
1201                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1202                 return X86EMUL_CONTINUE;
1203         }
1204
1205         vcpu->mmio_needed = 1;
1206         vcpu->mmio_phys_addr = gpa;
1207         vcpu->mmio_size = bytes;
1208         vcpu->mmio_is_write = 1;
1209         memcpy(vcpu->mmio_data, val, bytes);
1210
1211         return X86EMUL_CONTINUE;
1212 }
1213
1214 static int emulator_cmpxchg_emulated(unsigned long addr,
1215                                      const void *old,
1216                                      const void *new,
1217                                      unsigned int bytes,
1218                                      struct x86_emulate_ctxt *ctxt)
1219 {
1220         static int reported;
1221
1222         if (!reported) {
1223                 reported = 1;
1224                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1225         }
1226         return emulator_write_emulated(addr, new, bytes, ctxt);
1227 }
1228
1229 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1230 {
1231         return kvm_arch_ops->get_segment_base(vcpu, seg);
1232 }
1233
1234 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1235 {
1236         return X86EMUL_CONTINUE;
1237 }
1238
1239 int emulate_clts(struct kvm_vcpu *vcpu)
1240 {
1241         unsigned long cr0;
1242
1243         cr0 = vcpu->cr0 & ~CR0_TS_MASK;
1244         kvm_arch_ops->set_cr0(vcpu, cr0);
1245         return X86EMUL_CONTINUE;
1246 }
1247
1248 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1249 {
1250         struct kvm_vcpu *vcpu = ctxt->vcpu;
1251
1252         switch (dr) {
1253         case 0 ... 3:
1254                 *dest = kvm_arch_ops->get_dr(vcpu, dr);
1255                 return X86EMUL_CONTINUE;
1256         default:
1257                 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1258                        __FUNCTION__, dr);
1259                 return X86EMUL_UNHANDLEABLE;
1260         }
1261 }
1262
1263 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1264 {
1265         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1266         int exception;
1267
1268         kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1269         if (exception) {
1270                 /* FIXME: better handling */
1271                 return X86EMUL_UNHANDLEABLE;
1272         }
1273         return X86EMUL_CONTINUE;
1274 }
1275
1276 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1277 {
1278         static int reported;
1279         u8 opcodes[4];
1280         unsigned long rip = ctxt->vcpu->rip;
1281         unsigned long rip_linear;
1282
1283         rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);
1284
1285         if (reported)
1286                 return;
1287
1288         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
1289
1290         printk(KERN_ERR "emulation failed but !mmio_needed?"
1291                " rip %lx %02x %02x %02x %02x\n",
1292                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1293         reported = 1;
1294 }
1295
1296 struct x86_emulate_ops emulate_ops = {
1297         .read_std            = emulator_read_std,
1298         .write_std           = emulator_write_std,
1299         .read_emulated       = emulator_read_emulated,
1300         .write_emulated      = emulator_write_emulated,
1301         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1302 };
1303
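/*
 * Run one guest instruction through the x86 emulator: snapshot the guest
 * registers, build an x86_emulate_ctxt describing the execution mode and
 * segment bases, and hand it to x86_emulate_memop() with the callbacks
 * above.  If the access turns out to be MMIO, the kvm_run mmio block is
 * filled in so userspace can complete it; the return value distinguishes
 * EMULATE_DONE, EMULATE_DO_MMIO and EMULATE_FAIL.
 */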
1304 int emulate_instruction(struct kvm_vcpu *vcpu,
1305                         struct kvm_run *run,
1306                         unsigned long cr2,
1307                         u16 error_code)
1308 {
1309         struct x86_emulate_ctxt emulate_ctxt;
1310         int r;
1311         int cs_db, cs_l;
1312
1313         vcpu->mmio_fault_cr2 = cr2;
1314         kvm_arch_ops->cache_regs(vcpu);
1315
1316         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1317
1318         emulate_ctxt.vcpu = vcpu;
1319         emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
1320         emulate_ctxt.cr2 = cr2;
1321         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1322                 ? X86EMUL_MODE_REAL : cs_l
1323                 ? X86EMUL_MODE_PROT64 : cs_db
1324                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1325
1326         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1327                 emulate_ctxt.cs_base = 0;
1328                 emulate_ctxt.ds_base = 0;
1329                 emulate_ctxt.es_base = 0;
1330                 emulate_ctxt.ss_base = 0;
1331         } else {
1332                 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1333                 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1334                 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1335                 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1336         }
1337
1338         emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1339         emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1340
1341         vcpu->mmio_is_write = 0;
1342         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1343
1344         if ((r || vcpu->mmio_is_write) && run) {
1345                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1346                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1347                 run->mmio.len = vcpu->mmio_size;
1348                 run->mmio.is_write = vcpu->mmio_is_write;
1349         }
1350
1351         if (r) {
1352                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1353                         return EMULATE_DONE;
1354                 if (!vcpu->mmio_needed) {
1355                         report_emulation_failure(&emulate_ctxt);
1356                         return EMULATE_FAIL;
1357                 }
1358                 return EMULATE_DO_MMIO;
1359         }
1360
1361         kvm_arch_ops->decache_regs(vcpu);
1362         kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1363
1364         if (vcpu->mmio_is_write) {
1365                 vcpu->mmio_needed = 0;
1366                 return EMULATE_DO_MMIO;
1367         }
1368
1369         return EMULATE_DONE;
1370 }
1371 EXPORT_SYMBOL_GPL(emulate_instruction);
1372
1373 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1374 {
1375         if (vcpu->irq_summary)
1376                 return 1;
1377
1378         vcpu->run->exit_reason = KVM_EXIT_HLT;
1379         ++vcpu->stat.halt_exits;
1380         return 0;
1381 }
1382 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1383
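/*
 * Dispatch a guest hypercall.  The call number and arguments are taken
 * from the guest registers (a different register set is used in long
 * mode).  Numbers not handled in the kernel are passed to userspace via
 * run->hypercall and the function returns 0; otherwise the result is
 * placed in RAX and 1 is returned.
 */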
1384 int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1385 {
1386         unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1387
1388         kvm_arch_ops->cache_regs(vcpu);
1389         ret = -KVM_EINVAL;
1390 #ifdef CONFIG_X86_64
1391         if (is_long_mode(vcpu)) {
1392                 nr = vcpu->regs[VCPU_REGS_RAX];
1393                 a0 = vcpu->regs[VCPU_REGS_RDI];
1394                 a1 = vcpu->regs[VCPU_REGS_RSI];
1395                 a2 = vcpu->regs[VCPU_REGS_RDX];
1396                 a3 = vcpu->regs[VCPU_REGS_RCX];
1397                 a4 = vcpu->regs[VCPU_REGS_R8];
1398                 a5 = vcpu->regs[VCPU_REGS_R9];
1399         } else
1400 #endif
1401         {
1402                 nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
1403                 a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
1404                 a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
1405                 a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
1406                 a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
1407                 a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
1408                 a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
1409         }
1410         switch (nr) {
1411         default:
1412                 run->hypercall.args[0] = a0;
1413                 run->hypercall.args[1] = a1;
1414                 run->hypercall.args[2] = a2;
1415                 run->hypercall.args[3] = a3;
1416                 run->hypercall.args[4] = a4;
1417                 run->hypercall.args[5] = a5;
1418                 run->hypercall.ret = ret;
1419                 run->hypercall.longmode = is_long_mode(vcpu);
1420                 kvm_arch_ops->decache_regs(vcpu);
1421                 return 0;
1422         }
1423         vcpu->regs[VCPU_REGS_RAX] = ret;
1424         kvm_arch_ops->decache_regs(vcpu);
1425         return 1;
1426 }
1427 EXPORT_SYMBOL_GPL(kvm_hypercall);
1428
1429 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1430 {
1431         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1432 }
1433
1434 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1435 {
1436         struct descriptor_table dt = { limit, base };
1437
1438         kvm_arch_ops->set_gdt(vcpu, &dt);
1439 }
1440
1441 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1442 {
1443         struct descriptor_table dt = { limit, base };
1444
1445         kvm_arch_ops->set_idt(vcpu, &dt);
1446 }
1447
1448 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1449                    unsigned long *rflags)
1450 {
1451         lmsw(vcpu, msw);
1452         *rflags = kvm_arch_ops->get_rflags(vcpu);
1453 }
1454
1455 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1456 {
1457         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
1458         switch (cr) {
1459         case 0:
1460                 return vcpu->cr0;
1461         case 2:
1462                 return vcpu->cr2;
1463         case 3:
1464                 return vcpu->cr3;
1465         case 4:
1466                 return vcpu->cr4;
1467         default:
1468                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1469                 return 0;
1470         }
1471 }
1472
1473 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1474                      unsigned long *rflags)
1475 {
1476         switch (cr) {
1477         case 0:
1478                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1479                 *rflags = kvm_arch_ops->get_rflags(vcpu);
1480                 break;
1481         case 2:
1482                 vcpu->cr2 = val;
1483                 break;
1484         case 3:
1485                 set_cr3(vcpu, val);
1486                 break;
1487         case 4:
1488                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1489                 break;
1490         default:
1491                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1492         }
1493 }
1494
1495 /*
1496  * Register the para guest with the host:
1497  */
1498 static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1499 {
1500         struct kvm_vcpu_para_state *para_state;
1501         hpa_t para_state_hpa, hypercall_hpa;
1502         struct page *para_state_page;
1503         unsigned char *hypercall;
1504         gpa_t hypercall_gpa;
1505
1506         printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
1507         printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
1508
1509         /*
1510          * Needs to be page aligned:
1511          */
1512         if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
1513                 goto err_gp;
1514
1515         para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
1516         printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
1517         if (is_error_hpa(para_state_hpa))
1518                 goto err_gp;
1519
1520         mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1521         para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1522         para_state = kmap_atomic(para_state_page, KM_USER0);
1523
1524         printk(KERN_DEBUG "....  guest version: %d\n", para_state->guest_version);
1525         printk(KERN_DEBUG "....           size: %d\n", para_state->size);
1526
1527         para_state->host_version = KVM_PARA_API_VERSION;
1528         /*
1529          * We cannot support guests that try to register themselves
1530          * with a newer API version than the host supports:
1531          */
1532         if (para_state->guest_version > KVM_PARA_API_VERSION) {
1533                 para_state->ret = -KVM_EINVAL;
1534                 goto err_kunmap_skip;
1535         }
1536
1537         hypercall_gpa = para_state->hypercall_gpa;
1538         hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
1539         printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
1540         if (is_error_hpa(hypercall_hpa)) {
1541                 para_state->ret = -KVM_EINVAL;
1542                 goto err_kunmap_skip;
1543         }
1544
1545         printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
1546         vcpu->para_state_page = para_state_page;
1547         vcpu->para_state_gpa = para_state_gpa;
1548         vcpu->hypercall_gpa = hypercall_gpa;
1549
1550         mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1551         hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1552                                 KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1553         kvm_arch_ops->patch_hypercall(vcpu, hypercall);
1554         kunmap_atomic(hypercall, KM_USER1);
1555
1556         para_state->ret = 0;
1557 err_kunmap_skip:
1558         kunmap_atomic(para_state, KM_USER0);
1559         return 0;
1560 err_gp:
1561         return 1;
1562 }
1563
1564 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1565 {
1566         u64 data;
1567
1568         switch (msr) {
1569         case 0xc0010010: /* SYSCFG */
1570         case 0xc0010015: /* HWCR */
1571         case MSR_IA32_PLATFORM_ID:
1572         case MSR_IA32_P5_MC_ADDR:
1573         case MSR_IA32_P5_MC_TYPE:
1574         case MSR_IA32_MC0_CTL:
1575         case MSR_IA32_MCG_STATUS:
1576         case MSR_IA32_MCG_CAP:
1577         case MSR_IA32_MC0_MISC:
1578         case MSR_IA32_MC0_MISC+4:
1579         case MSR_IA32_MC0_MISC+8:
1580         case MSR_IA32_MC0_MISC+12:
1581         case MSR_IA32_MC0_MISC+16:
1582         case MSR_IA32_UCODE_REV:
1583         case MSR_IA32_PERF_STATUS:
1584         case MSR_IA32_EBL_CR_POWERON:
1585                 /* MTRR registers */
1586         case 0xfe:
1587         case 0x200 ... 0x2ff:
1588                 data = 0;
1589                 break;
1590         case 0xcd: /* fsb frequency */
1591                 data = 3;
1592                 break;
1593         case MSR_IA32_APICBASE:
1594                 data = vcpu->apic_base;
1595                 break;
1596         case MSR_IA32_MISC_ENABLE:
1597                 data = vcpu->ia32_misc_enable_msr;
1598                 break;
1599 #ifdef CONFIG_X86_64
1600         case MSR_EFER:
1601                 data = vcpu->shadow_efer;
1602                 break;
1603 #endif
1604         default:
1605                 printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr);
1606                 return 1;
1607         }
1608         *pdata = data;
1609         return 0;
1610 }
1611 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1612
1613 /*
1614  * Reads an msr value (of 'msr_index') into 'pdata'.
1615  * Returns 0 on success, non-0 otherwise.
1616  * Assumes vcpu_load() was already called.
1617  */
1618 static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1619 {
1620         return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
1621 }
1622
1623 #ifdef CONFIG_X86_64
1624
1625 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1626 {
1627         if (efer & EFER_RESERVED_BITS) {
1628                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1629                        efer);
1630                 inject_gp(vcpu);
1631                 return;
1632         }
1633
1634         if (is_paging(vcpu)
1635             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1636                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1637                 inject_gp(vcpu);
1638                 return;
1639         }
1640
1641         kvm_arch_ops->set_efer(vcpu, efer);
1642
1643         efer &= ~EFER_LMA;
1644         efer |= vcpu->shadow_efer & EFER_LMA;
1645
1646         vcpu->shadow_efer = efer;
1647 }
1648
1649 #endif
1650
1651 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1652 {
1653         switch (msr) {
1654 #ifdef CONFIG_X86_64
1655         case MSR_EFER:
1656                 set_efer(vcpu, data);
1657                 break;
1658 #endif
1659         case MSR_IA32_MC0_STATUS:
1660                 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1661                        __FUNCTION__, data);
1662                 break;
1663         case MSR_IA32_MCG_STATUS:
1664                 printk(KERN_WARNING "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1665                         __FUNCTION__, data);
1666                 break;
1667         case MSR_IA32_UCODE_REV:
1668         case MSR_IA32_UCODE_WRITE:
1669         case 0x200 ... 0x2ff: /* MTRRs */
1670                 break;
1671         case MSR_IA32_APICBASE:
1672                 vcpu->apic_base = data;
1673                 break;
1674         case MSR_IA32_MISC_ENABLE:
1675                 vcpu->ia32_misc_enable_msr = data;
1676                 break;
1677         /*
1678          * This is the 'probe whether the host is KVM' logic:
1679          */
1680         case MSR_KVM_API_MAGIC:
1681                 return vcpu_register_para(vcpu, data);
1682
1683         default:
1684                 printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
1685                 return 1;
1686         }
1687         return 0;
1688 }
1689 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1690
1691 /*
1692  * Writes msr value into the appropriate "register".
1693  * Returns 0 on success, non-0 otherwise.
1694  * Assumes vcpu_load() was already called.
1695  */
1696 static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1697 {
1698         return kvm_arch_ops->set_msr(vcpu, msr_index, data);
1699 }
1700
1701 void kvm_resched(struct kvm_vcpu *vcpu)
1702 {
1703         if (!need_resched())
1704                 return;
1705         vcpu_put(vcpu);
1706         cond_resched();
1707         vcpu_load(vcpu);
1708 }
1709 EXPORT_SYMBOL_GPL(kvm_resched);
1710
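/*
 * Bulk wrmsr/rdmsr helpers over a vmx_msr_entry array, used for the vmx
 * guest/host MSR save and restore areas.
 */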
1711 void load_msrs(struct vmx_msr_entry *e, int n)
1712 {
1713         int i;
1714
1715         for (i = 0; i < n; ++i)
1716                 wrmsrl(e[i].index, e[i].data);
1717 }
1718 EXPORT_SYMBOL_GPL(load_msrs);
1719
1720 void save_msrs(struct vmx_msr_entry *e, int n)
1721 {
1722         int i;
1723
1724         for (i = 0; i < n; ++i)
1725                 rdmsrl(e[i].index, e[i].data);
1726 }
1727 EXPORT_SYMBOL_GPL(save_msrs);
1728
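/*
 * Emulate CPUID: look up the userspace-supplied cpuid table for the function
 * in RAX.  An exact match wins; otherwise fall back to the highest entry in
 * the same class (basic vs. extended) as the requested function, load the
 * result into RAX..RDX and skip the emulated instruction.
 */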
1729 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1730 {
1731         int i;
1732         u32 function;
1733         struct kvm_cpuid_entry *e, *best;
1734
1735         kvm_arch_ops->cache_regs(vcpu);
1736         function = vcpu->regs[VCPU_REGS_RAX];
1737         vcpu->regs[VCPU_REGS_RAX] = 0;
1738         vcpu->regs[VCPU_REGS_RBX] = 0;
1739         vcpu->regs[VCPU_REGS_RCX] = 0;
1740         vcpu->regs[VCPU_REGS_RDX] = 0;
1741         best = NULL;
1742         for (i = 0; i < vcpu->cpuid_nent; ++i) {
1743                 e = &vcpu->cpuid_entries[i];
1744                 if (e->function == function) {
1745                         best = e;
1746                         break;
1747                 }
1748                 /*
1749                  * Both basic or both extended?
1750                  */
1751                 if (((e->function ^ function) & 0x80000000) == 0)
1752                         if (!best || e->function > best->function)
1753                                 best = e;
1754         }
1755         if (best) {
1756                 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1757                 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1758                 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1759                 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1760         }
1761         kvm_arch_ops->decache_regs(vcpu);
1762         kvm_arch_ops->skip_emulated_instruction(vcpu);
1763 }
1764 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1765
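/*
 * Copy string PIO data between the vcpu's pio_data page and the pinned
 * guest pages (vmap()ed temporarily): towards guest memory for an 'in',
 * towards pio_data for an 'out'.
 */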
1766 static int pio_copy_data(struct kvm_vcpu *vcpu)
1767 {
1768         void *p = vcpu->pio_data;
1769         void *q;
1770         unsigned bytes;
1771         int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1772
1773         kvm_arch_ops->vcpu_put(vcpu);
1774         q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1775                  PAGE_KERNEL);
1776         if (!q) {
1777                 kvm_arch_ops->vcpu_load(vcpu);
1778                 free_pio_guest_pages(vcpu);
1779                 return -ENOMEM;
1780         }
1781         q += vcpu->pio.guest_page_offset;
1782         bytes = vcpu->pio.size * vcpu->pio.cur_count;
1783         if (vcpu->pio.in)
1784                 memcpy(q, p, bytes);
1785         else
1786                 memcpy(p, q, bytes);
1787         q -= vcpu->pio.guest_page_offset;
1788         vunmap(q);
1789         kvm_arch_ops->vcpu_load(vcpu);
1790         free_pio_guest_pages(vcpu);
1791         return 0;
1792 }
1793
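/*
 * Complete a port I/O operation once its data is available: for non-string
 * input, load the result into RAX; for string input, copy the data back
 * into guest memory.  String I/O then advances RDI (input) or RSI (output),
 * scaled by the element size, and decrements RCX when REP-prefixed.  The
 * emulated instruction is skipped only once the whole count is done.
 */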
1794 static int complete_pio(struct kvm_vcpu *vcpu)
1795 {
1796         struct kvm_pio_request *io = &vcpu->pio;
1797         long delta;
1798         int r;
1799
1800         kvm_arch_ops->cache_regs(vcpu);
1801
1802         if (!io->string) {
1803                 if (io->in)
1804                         memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1805                                io->size);
1806         } else {
1807                 if (io->in) {
1808                         r = pio_copy_data(vcpu);
1809                         if (r) {
1810                                 kvm_arch_ops->cache_regs(vcpu);
1811                                 return r;
1812                         }
1813                 }
1814
1815                 delta = 1;
1816                 if (io->rep) {
1817                         delta *= io->cur_count;
1818                         /*
1819                          * The size of the register should really depend on
1820                          * current address size.
1821                          */
1822                         vcpu->regs[VCPU_REGS_RCX] -= delta;
1823                 }
1824                 if (io->down)
1825                         delta = -delta;
1826                 delta *= io->size;
1827                 if (io->in)
1828                         vcpu->regs[VCPU_REGS_RDI] += delta;
1829                 else
1830                         vcpu->regs[VCPU_REGS_RSI] += delta;
1831         }
1832
1833         kvm_arch_ops->decache_regs(vcpu);
1834
1835         io->count -= io->cur_count;
1836         io->cur_count = 0;
1837
1838         if (!io->count)
1839                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1840         return 0;
1841 }
1842
1843 void kernel_pio(struct kvm_io_device *pio_dev, struct kvm_vcpu *vcpu)
1844 {
1845         /* TODO: String I/O for in kernel device */
1846
1847         if (vcpu->pio.in)
1848                 kvm_iodevice_read(pio_dev, vcpu->pio.port,
1849                                   vcpu->pio.size,
1850                                   vcpu->pio_data);
1851         else
1852                 kvm_iodevice_write(pio_dev, vcpu->pio.port,
1853                                    vcpu->pio.size,
1854                                    vcpu->pio_data);
1855 }
1856
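/*
 * Set up a port I/O exit: fill in vcpu->run and vcpu->pio, satisfy the
 * request immediately if an in-kernel device claims the port (non-string
 * I/O only), and pin the guest pages backing a string transfer.  Returns 1
 * when the guest can resume without a userspace exit, 0 when userspace must
 * handle the I/O, negative errno on failure.
 */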
1857 int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1858                   int size, unsigned long count, int string, int down,
1859                   gva_t address, int rep, unsigned port)
1860 {
1861         unsigned now, in_page;
1862         int i;
1863         int nr_pages = 1;
1864         struct page *page;
1865         struct kvm_io_device *pio_dev;
1866
1867         vcpu->run->exit_reason = KVM_EXIT_IO;
1868         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1869         vcpu->run->io.size = size;
1870         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1871         vcpu->run->io.count = count;
1872         vcpu->run->io.port = port;
1873         vcpu->pio.count = count;
1874         vcpu->pio.cur_count = count;
1875         vcpu->pio.size = size;
1876         vcpu->pio.in = in;
1877         vcpu->pio.port = port;
1878         vcpu->pio.string = string;
1879         vcpu->pio.down = down;
1880         vcpu->pio.guest_page_offset = offset_in_page(address);
1881         vcpu->pio.rep = rep;
1882
1883         pio_dev = vcpu_find_pio_dev(vcpu, port);
1884         if (!string) {
1885                 kvm_arch_ops->cache_regs(vcpu);
1886                 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1887                 kvm_arch_ops->decache_regs(vcpu);
1888                 if (pio_dev) {
1889                         kernel_pio(pio_dev, vcpu);
1890                         complete_pio(vcpu);
1891                         return 1;
1892                 }
1893                 return 0;
1894         }
1895         /* TODO: String I/O for in kernel device */
1896         if (pio_dev)
1897                 printk(KERN_ERR "kvm_setup_pio: no string io support\n");
1898
1899         if (!count) {
1900                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1901                 return 1;
1902         }
1903
1904         now = min(count, PAGE_SIZE / size);
1905
1906         if (!down)
1907                 in_page = PAGE_SIZE - offset_in_page(address);
1908         else
1909                 in_page = offset_in_page(address) + size;
1910         now = min(count, (unsigned long)in_page / size);
1911         if (!now) {
1912                 /*
1913                  * String I/O straddles page boundary.  Pin two guest pages
1914                  * so that we satisfy atomicity constraints.  Do just one
1915                  * transaction to avoid complexity.
1916                  */
1917                 nr_pages = 2;
1918                 now = 1;
1919         }
1920         if (down) {
1921                 /*
1922                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
1923                  */
1924                 printk(KERN_ERR "kvm: guest string pio down\n");
1925                 inject_gp(vcpu);
1926                 return 1;
1927         }
1928         vcpu->run->io.count = now;
1929         vcpu->pio.cur_count = now;
1930
1931         for (i = 0; i < nr_pages; ++i) {
1932                 spin_lock(&vcpu->kvm->lock);
1933                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1934                 if (page)
1935                         get_page(page);
1936                 vcpu->pio.guest_pages[i] = page;
1937                 spin_unlock(&vcpu->kvm->lock);
1938                 if (!page) {
1939                         inject_gp(vcpu);
1940                         free_pio_guest_pages(vcpu);
1941                         return 1;
1942                 }
1943         }
1944
1945         if (!vcpu->pio.in)
1946                 return pio_copy_data(vcpu);
1947         return 0;
1948 }
1949 EXPORT_SYMBOL_GPL(kvm_setup_pio);
1950
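/*
 * KVM_RUN: finish any PIO or MMIO left pending from the previous exit,
 * propagate a hypercall return value into RAX, then enter the guest via
 * kvm_arch_ops->run() with the caller's signal mask temporarily installed.
 */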
1951 static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1952 {
1953         int r;
1954         sigset_t sigsaved;
1955
1956         vcpu_load(vcpu);
1957
1958         if (vcpu->sigset_active)
1959                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
1960
1961         /* re-sync apic's tpr */
1962         vcpu->cr8 = kvm_run->cr8;
1963
1964         if (vcpu->pio.cur_count) {
1965                 r = complete_pio(vcpu);
1966                 if (r)
1967                         goto out;
1968         }
1969
1970         if (vcpu->mmio_needed) {
1971                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1972                 vcpu->mmio_read_completed = 1;
1973                 vcpu->mmio_needed = 0;
1974                 r = emulate_instruction(vcpu, kvm_run,
1975                                         vcpu->mmio_fault_cr2, 0);
1976                 if (r == EMULATE_DO_MMIO) {
1977                         /*
1978                          * Read-modify-write.  Back to userspace.
1979                          */
1980                         kvm_run->exit_reason = KVM_EXIT_MMIO;
1981                         r = 0;
1982                         goto out;
1983                 }
1984         }
1985
1986         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
1987                 kvm_arch_ops->cache_regs(vcpu);
1988                 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
1989                 kvm_arch_ops->decache_regs(vcpu);
1990         }
1991
1992         r = kvm_arch_ops->run(vcpu, kvm_run);
1993
1994 out:
1995         if (vcpu->sigset_active)
1996                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1997
1998         vcpu_put(vcpu);
1999         return r;
2000 }
2001
2002 static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
2003                                    struct kvm_regs *regs)
2004 {
2005         vcpu_load(vcpu);
2006
2007         kvm_arch_ops->cache_regs(vcpu);
2008
2009         regs->rax = vcpu->regs[VCPU_REGS_RAX];
2010         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
2011         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
2012         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
2013         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
2014         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
2015         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
2016         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
2017 #ifdef CONFIG_X86_64
2018         regs->r8 = vcpu->regs[VCPU_REGS_R8];
2019         regs->r9 = vcpu->regs[VCPU_REGS_R9];
2020         regs->r10 = vcpu->regs[VCPU_REGS_R10];
2021         regs->r11 = vcpu->regs[VCPU_REGS_R11];
2022         regs->r12 = vcpu->regs[VCPU_REGS_R12];
2023         regs->r13 = vcpu->regs[VCPU_REGS_R13];
2024         regs->r14 = vcpu->regs[VCPU_REGS_R14];
2025         regs->r15 = vcpu->regs[VCPU_REGS_R15];
2026 #endif
2027
2028         regs->rip = vcpu->rip;
2029         regs->rflags = kvm_arch_ops->get_rflags(vcpu);
2030
2031         /*
2032          * Don't leak debug flags in case they were set for guest debugging
2033          */
2034         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2035                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2036
2037         vcpu_put(vcpu);
2038
2039         return 0;
2040 }
2041
2042 static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
2043                                    struct kvm_regs *regs)
2044 {
2045         vcpu_load(vcpu);
2046
2047         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2048         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2049         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2050         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2051         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2052         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2053         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
2054         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2055 #ifdef CONFIG_X86_64
2056         vcpu->regs[VCPU_REGS_R8] = regs->r8;
2057         vcpu->regs[VCPU_REGS_R9] = regs->r9;
2058         vcpu->regs[VCPU_REGS_R10] = regs->r10;
2059         vcpu->regs[VCPU_REGS_R11] = regs->r11;
2060         vcpu->regs[VCPU_REGS_R12] = regs->r12;
2061         vcpu->regs[VCPU_REGS_R13] = regs->r13;
2062         vcpu->regs[VCPU_REGS_R14] = regs->r14;
2063         vcpu->regs[VCPU_REGS_R15] = regs->r15;
2064 #endif
2065
2066         vcpu->rip = regs->rip;
2067         kvm_arch_ops->set_rflags(vcpu, regs->rflags);
2068
2069         kvm_arch_ops->decache_regs(vcpu);
2070
2071         vcpu_put(vcpu);
2072
2073         return 0;
2074 }
2075
2076 static void get_segment(struct kvm_vcpu *vcpu,
2077                         struct kvm_segment *var, int seg)
2078 {
2079         return kvm_arch_ops->get_segment(vcpu, var, seg);
2080 }
2081
2082 static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2083                                     struct kvm_sregs *sregs)
2084 {
2085         struct descriptor_table dt;
2086
2087         vcpu_load(vcpu);
2088
2089         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2090         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2091         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2092         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2093         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2094         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2095
2096         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2097         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2098
2099         kvm_arch_ops->get_idt(vcpu, &dt);
2100         sregs->idt.limit = dt.limit;
2101         sregs->idt.base = dt.base;
2102         kvm_arch_ops->get_gdt(vcpu, &dt);
2103         sregs->gdt.limit = dt.limit;
2104         sregs->gdt.base = dt.base;
2105
2106         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
2107         sregs->cr0 = vcpu->cr0;
2108         sregs->cr2 = vcpu->cr2;
2109         sregs->cr3 = vcpu->cr3;
2110         sregs->cr4 = vcpu->cr4;
2111         sregs->cr8 = vcpu->cr8;
2112         sregs->efer = vcpu->shadow_efer;
2113         sregs->apic_base = vcpu->apic_base;
2114
2115         memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
2116                sizeof sregs->interrupt_bitmap);
2117
2118         vcpu_put(vcpu);
2119
2120         return 0;
2121 }
2122
2123 static void set_segment(struct kvm_vcpu *vcpu,
2124                         struct kvm_segment *var, int seg)
2125 {
2126         return kvm_arch_ops->set_segment(vcpu, var, seg);
2127 }
2128
2129 static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2130                                     struct kvm_sregs *sregs)
2131 {
2132         int mmu_reset_needed = 0;
2133         int i;
2134         struct descriptor_table dt;
2135
2136         vcpu_load(vcpu);
2137
2138         dt.limit = sregs->idt.limit;
2139         dt.base = sregs->idt.base;
2140         kvm_arch_ops->set_idt(vcpu, &dt);
2141         dt.limit = sregs->gdt.limit;
2142         dt.base = sregs->gdt.base;
2143         kvm_arch_ops->set_gdt(vcpu, &dt);
2144
2145         vcpu->cr2 = sregs->cr2;
2146         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2147         vcpu->cr3 = sregs->cr3;
2148
2149         vcpu->cr8 = sregs->cr8;
2150
2151         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2152 #ifdef CONFIG_X86_64
2153         kvm_arch_ops->set_efer(vcpu, sregs->efer);
2154 #endif
2155         vcpu->apic_base = sregs->apic_base;
2156
2157         kvm_arch_ops->decache_cr4_guest_bits(vcpu);
2158
2159         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2160         kvm_arch_ops->set_cr0(vcpu, sregs->cr0);
2161
2162         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2163         kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
2164         if (!is_long_mode(vcpu) && is_pae(vcpu))
2165                 load_pdptrs(vcpu, vcpu->cr3);
2166
2167         if (mmu_reset_needed)
2168                 kvm_mmu_reset_context(vcpu);
2169
2170         memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2171                sizeof vcpu->irq_pending);
2172         vcpu->irq_summary = 0;
2173         for (i = 0; i < NR_IRQ_WORDS; ++i)
2174                 if (vcpu->irq_pending[i])
2175                         __set_bit(i, &vcpu->irq_summary);
2176
2177         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2178         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2179         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2180         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2181         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2182         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2183
2184         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2185         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2186
2187         vcpu_put(vcpu);
2188
2189         return 0;
2190 }
2191
2192 /*
2193  * List of msr numbers which we expose to userspace through KVM_GET_MSRS,
2194  * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
2195  *
2196  * This list is modified at module load time to reflect the
2197  * capabilities of the host cpu.
2198  */
2199 static u32 msrs_to_save[] = {
2200         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
2201         MSR_K6_STAR,
2202 #ifdef CONFIG_X86_64
2203         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
2204 #endif
2205         MSR_IA32_TIME_STAMP_COUNTER,
2206 };
2207
2208 static unsigned num_msrs_to_save;
2209
2210 static u32 emulated_msrs[] = {
2211         MSR_IA32_MISC_ENABLE,
2212 };
2213
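/*
 * Probe every MSR in msrs_to_save with rdmsr_safe() and compact the array
 * so that it only lists MSRs the host cpu actually implements.
 */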
2214 static __init void kvm_init_msr_list(void)
2215 {
2216         u32 dummy[2];
2217         unsigned i, j;
2218
2219         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2220                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2221                         continue;
2222                 if (j < i)
2223                         msrs_to_save[j] = msrs_to_save[i];
2224                 j++;
2225         }
2226         num_msrs_to_save = j;
2227 }
2228
2229 /*
2230  * Adapt set_msr() to msr_io()'s calling convention
2231  */
2232 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2233 {
2234         return set_msr(vcpu, index, *data);
2235 }
2236
2237 /*
2238  * Read or write a bunch of msrs. All parameters are kernel addresses.
2239  *
2240  * @return number of msrs set successfully.
2241  */
2242 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2243                     struct kvm_msr_entry *entries,
2244                     int (*do_msr)(struct kvm_vcpu *vcpu,
2245                                   unsigned index, u64 *data))
2246 {
2247         int i;
2248
2249         vcpu_load(vcpu);
2250
2251         for (i = 0; i < msrs->nmsrs; ++i)
2252                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2253                         break;
2254
2255         vcpu_put(vcpu);
2256
2257         return i;
2258 }
2259
2260 /*
2261  * Read or write a bunch of msrs. Parameters are user addresses.
2262  *
2263  * @return number of msrs set successfully.
2264  */
2265 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2266                   int (*do_msr)(struct kvm_vcpu *vcpu,
2267                                 unsigned index, u64 *data),
2268                   int writeback)
2269 {
2270         struct kvm_msrs msrs;
2271         struct kvm_msr_entry *entries;
2272         int r, n;
2273         unsigned size;
2274
2275         r = -EFAULT;
2276         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2277                 goto out;
2278
2279         r = -E2BIG;
2280         if (msrs.nmsrs >= MAX_IO_MSRS)
2281                 goto out;
2282
2283         r = -ENOMEM;
2284         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2285         entries = vmalloc(size);
2286         if (!entries)
2287                 goto out;
2288
2289         r = -EFAULT;
2290         if (copy_from_user(entries, user_msrs->entries, size))
2291                 goto out_free;
2292
2293         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2294         if (r < 0)
2295                 goto out_free;
2296
2297         r = -EFAULT;
2298         if (writeback && copy_to_user(user_msrs->entries, entries, size))
2299                 goto out_free;
2300
2301         r = n;
2302
2303 out_free:
2304         vfree(entries);
2305 out:
2306         return r;
2307 }
2308
2309 /*
2310  * Translate a guest virtual address to a guest physical address.
2311  */
2312 static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2313                                     struct kvm_translation *tr)
2314 {
2315         unsigned long vaddr = tr->linear_address;
2316         gpa_t gpa;
2317
2318         vcpu_load(vcpu);
2319         spin_lock(&vcpu->kvm->lock);
2320         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2321         tr->physical_address = gpa;
2322         tr->valid = gpa != UNMAPPED_GVA;
2323         tr->writeable = 1;
2324         tr->usermode = 0;
2325         spin_unlock(&vcpu->kvm->lock);
2326         vcpu_put(vcpu);
2327
2328         return 0;
2329 }
2330
2331 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2332                                     struct kvm_interrupt *irq)
2333 {
2334         if (irq->irq < 0 || irq->irq >= 256)
2335                 return -EINVAL;
2336         vcpu_load(vcpu);
2337
2338         set_bit(irq->irq, vcpu->irq_pending);
2339         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2340
2341         vcpu_put(vcpu);
2342
2343         return 0;
2344 }
2345
2346 static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2347                                       struct kvm_debug_guest *dbg)
2348 {
2349         int r;
2350
2351         vcpu_load(vcpu);
2352
2353         r = kvm_arch_ops->set_guest_debug(vcpu, dbg);
2354
2355         vcpu_put(vcpu);
2356
2357         return r;
2358 }
2359
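/*
 * nopage handler for the per-vcpu mmap: offset 0 maps the kvm_run
 * structure, offset KVM_PIO_PAGE_OFFSET maps the PIO data page.
 */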
2360 static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2361                                     unsigned long address,
2362                                     int *type)
2363 {
2364         struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2365         unsigned long pgoff;
2366         struct page *page;
2367
2368         *type = VM_FAULT_MINOR;
2369         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2370         if (pgoff == 0)
2371                 page = virt_to_page(vcpu->run);
2372         else if (pgoff == KVM_PIO_PAGE_OFFSET)
2373                 page = virt_to_page(vcpu->pio_data);
2374         else
2375                 return NOPAGE_SIGBUS;
2376         get_page(page);
2377         return page;
2378 }
2379
2380 static struct vm_operations_struct kvm_vcpu_vm_ops = {
2381         .nopage = kvm_vcpu_nopage,
2382 };
2383
2384 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2385 {
2386         vma->vm_ops = &kvm_vcpu_vm_ops;
2387         return 0;
2388 }
2389
2390 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2391 {
2392         struct kvm_vcpu *vcpu = filp->private_data;
2393
2394         fput(vcpu->kvm->filp);
2395         return 0;
2396 }
2397
2398 static struct file_operations kvm_vcpu_fops = {
2399         .release        = kvm_vcpu_release,
2400         .unlocked_ioctl = kvm_vcpu_ioctl,
2401         .compat_ioctl   = kvm_vcpu_ioctl,
2402         .mmap           = kvm_vcpu_mmap,
2403 };
2404
2405 /*
2406  * Allocates an inode for the vcpu.
2407  */
2408 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2409 {
2410         int fd, r;
2411         struct inode *inode;
2412         struct file *file;
2413
2414         atomic_inc(&vcpu->kvm->filp->f_count);
2415         inode = kvmfs_inode(&kvm_vcpu_fops);
2416         if (IS_ERR(inode)) {
2417                 r = PTR_ERR(inode);
2418                 goto out1;
2419         }
2420
2421         file = kvmfs_file(inode, vcpu);
2422         if (IS_ERR(file)) {
2423                 r = PTR_ERR(file);
2424                 goto out2;
2425         }
2426
2427         r = get_unused_fd();
2428         if (r < 0)
2429                 goto out3;
2430         fd = r;
2431         fd_install(fd, file);
2432
2433         return fd;
2434
2435 out3:
2436         fput(file);
2437 out2:
2438         iput(inode);
2439 out1:
2440         fput(vcpu->kvm->filp);
2441         return r;
2442 }
2443
2444 /*
2445  * Creates some virtual cpus.  Good luck creating more than one.
2446  */
2447 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2448 {
2449         int r;
2450         struct kvm_vcpu *vcpu;
2451         struct page *page;
2452
2453         r = -EINVAL;
2454         if (!valid_vcpu(n))
2455                 goto out;
2456
2457         vcpu = &kvm->vcpus[n];
2458
2459         mutex_lock(&vcpu->mutex);
2460
2461         if (vcpu->vmcs) {
2462                 mutex_unlock(&vcpu->mutex);
2463                 return -EEXIST;
2464         }
2465
2466         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2467         r = -ENOMEM;
2468         if (!page)
2469                 goto out_unlock;
2470         vcpu->run = page_address(page);
2471
2472         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2473         r = -ENOMEM;
2474         if (!page)
2475                 goto out_free_run;
2476         vcpu->pio_data = page_address(page);
2477
2478         vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
2479                                            FX_IMAGE_ALIGN);
2480         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
2481         vcpu->cr0 = 0x10;
2482
2483         r = kvm_arch_ops->vcpu_create(vcpu);
2484         if (r < 0)
2485                 goto out_free_vcpus;
2486
2487         r = kvm_mmu_create(vcpu);
2488         if (r < 0)
2489                 goto out_free_vcpus;
2490
2491         kvm_arch_ops->vcpu_load(vcpu);
2492         r = kvm_mmu_setup(vcpu);
2493         if (r >= 0)
2494                 r = kvm_arch_ops->vcpu_setup(vcpu);
2495         vcpu_put(vcpu);
2496
2497         if (r < 0)
2498                 goto out_free_vcpus;
2499
2500         r = create_vcpu_fd(vcpu);
2501         if (r < 0)
2502                 goto out_free_vcpus;
2503
2504         spin_lock(&kvm_lock);
2505         if (n >= kvm->nvcpus)
2506                 kvm->nvcpus = n + 1;
2507         spin_unlock(&kvm_lock);
2508
2509         return r;
2510
2511 out_free_vcpus:
2512         kvm_free_vcpu(vcpu);
2513 out_free_run:
2514         free_page((unsigned long)vcpu->run);
2515         vcpu->run = NULL;
2516 out_unlock:
2517         mutex_unlock(&vcpu->mutex);
2518 out:
2519         return r;
2520 }
2521
2522 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2523 {
2524         u64 efer;
2525         int i;
2526         struct kvm_cpuid_entry *e, *entry;
2527
2528         rdmsrl(MSR_EFER, efer);
2529         entry = NULL;
2530         for (i = 0; i < vcpu->cpuid_nent; ++i) {
2531                 e = &vcpu->cpuid_entries[i];
2532                 if (e->function == 0x80000001) {
2533                         entry = e;
2534                         break;
2535                 }
2536         }
2537         if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
2538                 entry->edx &= ~(1 << 20);
2539                 printk(KERN_INFO "kvm: guest NX capability removed\n");
2540         }
2541 }
2542
2543 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2544                                     struct kvm_cpuid *cpuid,
2545                                     struct kvm_cpuid_entry __user *entries)
2546 {
2547         int r;
2548
2549         r = -E2BIG;
2550         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2551                 goto out;
2552         r = -EFAULT;
2553         if (copy_from_user(&vcpu->cpuid_entries, entries,
2554                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2555                 goto out;
2556         vcpu->cpuid_nent = cpuid->nent;
2557         cpuid_fix_nx_cap(vcpu);
2558         return 0;
2559
2560 out:
2561         return r;
2562 }
2563
2564 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2565 {
2566         if (sigset) {
2567                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2568                 vcpu->sigset_active = 1;
2569                 vcpu->sigset = *sigset;
2570         } else
2571                 vcpu->sigset_active = 0;
2572         return 0;
2573 }
2574
2575 /*
2576  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
2577  * we have asm/x86/processor.h
2578  */
2579 struct fxsave {
2580         u16     cwd;
2581         u16     swd;
2582         u16     twd;
2583         u16     fop;
2584         u64     rip;
2585         u64     rdp;
2586         u32     mxcsr;
2587         u32     mxcsr_mask;
2588         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
2589 #ifdef CONFIG_X86_64
2590         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
2591 #else
2592         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
2593 #endif
2594 };
2595
2596 static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2597 {
2598         struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2599
2600         vcpu_load(vcpu);
2601
2602         memcpy(fpu->fpr, fxsave->st_space, 128);
2603         fpu->fcw = fxsave->cwd;
2604         fpu->fsw = fxsave->swd;
2605         fpu->ftwx = fxsave->twd;
2606         fpu->last_opcode = fxsave->fop;
2607         fpu->last_ip = fxsave->rip;
2608         fpu->last_dp = fxsave->rdp;
2609         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2610
2611         vcpu_put(vcpu);
2612
2613         return 0;
2614 }
2615
2616 static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2617 {
2618         struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;
2619
2620         vcpu_load(vcpu);
2621
2622         memcpy(fxsave->st_space, fpu->fpr, 128);
2623         fxsave->cwd = fpu->fcw;
2624         fxsave->swd = fpu->fsw;
2625         fxsave->twd = fpu->ftwx;
2626         fxsave->fop = fpu->last_opcode;
2627         fxsave->rip = fpu->last_ip;
2628         fxsave->rdp = fpu->last_dp;
2629         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2630
2631         vcpu_put(vcpu);
2632
2633         return 0;
2634 }
2635
2636 static long kvm_vcpu_ioctl(struct file *filp,
2637                            unsigned int ioctl, unsigned long arg)
2638 {
2639         struct kvm_vcpu *vcpu = filp->private_data;
2640         void __user *argp = (void __user *)arg;
2641         int r = -EINVAL;
2642
2643         switch (ioctl) {
2644         case KVM_RUN:
2645                 r = -EINVAL;
2646                 if (arg)
2647                         goto out;
2648                 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2649                 break;
2650         case KVM_GET_REGS: {
2651                 struct kvm_regs kvm_regs;
2652
2653                 memset(&kvm_regs, 0, sizeof kvm_regs);
2654                 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2655                 if (r)
2656                         goto out;
2657                 r = -EFAULT;
2658                 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2659                         goto out;
2660                 r = 0;
2661                 break;
2662         }
2663         case KVM_SET_REGS: {
2664                 struct kvm_regs kvm_regs;
2665
2666                 r = -EFAULT;
2667                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2668                         goto out;
2669                 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2670                 if (r)
2671                         goto out;
2672                 r = 0;
2673                 break;
2674         }
2675         case KVM_GET_SREGS: {
2676                 struct kvm_sregs kvm_sregs;
2677
2678                 memset(&kvm_sregs, 0, sizeof kvm_sregs);
2679                 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2680                 if (r)
2681                         goto out;
2682                 r = -EFAULT;
2683                 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2684                         goto out;
2685                 r = 0;
2686                 break;
2687         }
2688         case KVM_SET_SREGS: {
2689                 struct kvm_sregs kvm_sregs;
2690
2691                 r = -EFAULT;
2692                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2693                         goto out;
2694                 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2695                 if (r)
2696                         goto out;
2697                 r = 0;
2698                 break;
2699         }
2700         case KVM_TRANSLATE: {
2701                 struct kvm_translation tr;
2702
2703                 r = -EFAULT;
2704                 if (copy_from_user(&tr, argp, sizeof tr))
2705                         goto out;
2706                 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2707                 if (r)
2708                         goto out;
2709                 r = -EFAULT;
2710                 if (copy_to_user(argp, &tr, sizeof tr))
2711                         goto out;
2712                 r = 0;
2713                 break;
2714         }
2715         case KVM_INTERRUPT: {
2716                 struct kvm_interrupt irq;
2717
2718                 r = -EFAULT;
2719                 if (copy_from_user(&irq, argp, sizeof irq))
2720                         goto out;
2721                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2722                 if (r)
2723                         goto out;
2724                 r = 0;
2725                 break;
2726         }
2727         case KVM_DEBUG_GUEST: {
2728                 struct kvm_debug_guest dbg;
2729
2730                 r = -EFAULT;
2731                 if (copy_from_user(&dbg, argp, sizeof dbg))
2732                         goto out;
2733                 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2734                 if (r)
2735                         goto out;
2736                 r = 0;
2737                 break;
2738         }
2739         case KVM_GET_MSRS:
2740                 r = msr_io(vcpu, argp, get_msr, 1);
2741                 break;
2742         case KVM_SET_MSRS:
2743                 r = msr_io(vcpu, argp, do_set_msr, 0);
2744                 break;
2745         case KVM_SET_CPUID: {
2746                 struct kvm_cpuid __user *cpuid_arg = argp;
2747                 struct kvm_cpuid cpuid;
2748
2749                 r = -EFAULT;
2750                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2751                         goto out;
2752                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2753                 if (r)
2754                         goto out;
2755                 break;
2756         }
2757         case KVM_SET_SIGNAL_MASK: {
2758                 struct kvm_signal_mask __user *sigmask_arg = argp;
2759                 struct kvm_signal_mask kvm_sigmask;
2760                 sigset_t sigset, *p;
2761
2762                 p = NULL;
2763                 if (argp) {
2764                         r = -EFAULT;
2765                         if (copy_from_user(&kvm_sigmask, argp,
2766                                            sizeof kvm_sigmask))
2767                                 goto out;
2768                         r = -EINVAL;
2769                         if (kvm_sigmask.len != sizeof sigset)
2770                                 goto out;
2771                         r = -EFAULT;
2772                         if (copy_from_user(&sigset, sigmask_arg->sigset,
2773                                            sizeof sigset))
2774                                 goto out;
2775                         p = &sigset;
2776                 }
2777                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2778                 break;
2779         }
2780         case KVM_GET_FPU: {
2781                 struct kvm_fpu fpu;
2782
2783                 memset(&fpu, 0, sizeof fpu);
2784                 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2785                 if (r)
2786                         goto out;
2787                 r = -EFAULT;
2788                 if (copy_to_user(argp, &fpu, sizeof fpu))
2789                         goto out;
2790                 r = 0;
2791                 break;
2792         }
2793         case KVM_SET_FPU: {
2794                 struct kvm_fpu fpu;
2795
2796                 r = -EFAULT;
2797                 if (copy_from_user(&fpu, argp, sizeof fpu))
2798                         goto out;
2799                 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2800                 if (r)
2801                         goto out;
2802                 r = 0;
2803                 break;
2804         }
2805         default:
2806                 ;
2807         }
2808 out:
2809         return r;
2810 }
2811
2812 static long kvm_vm_ioctl(struct file *filp,
2813                            unsigned int ioctl, unsigned long arg)
2814 {
2815         struct kvm *kvm = filp->private_data;
2816         void __user *argp = (void __user *)arg;
2817         int r = -EINVAL;
2818
2819         switch (ioctl) {
2820         case KVM_CREATE_VCPU:
2821                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2822                 if (r < 0)
2823                         goto out;
2824                 break;
2825         case KVM_SET_MEMORY_REGION: {
2826                 struct kvm_memory_region kvm_mem;
2827
2828                 r = -EFAULT;
2829                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2830                         goto out;
2831                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
2832                 if (r)
2833                         goto out;
2834                 break;
2835         }
2836         case KVM_GET_DIRTY_LOG: {
2837                 struct kvm_dirty_log log;
2838
2839                 r = -EFAULT;
2840                 if (copy_from_user(&log, argp, sizeof log))
2841                         goto out;
2842                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2843                 if (r)
2844                         goto out;
2845                 break;
2846         }
2847         case KVM_SET_MEMORY_ALIAS: {
2848                 struct kvm_memory_alias alias;
2849
2850                 r = -EFAULT;
2851                 if (copy_from_user(&alias, argp, sizeof alias))
2852                         goto out;
2853                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
2854                 if (r)
2855                         goto out;
2856                 break;
2857         }
2858         default:
2859                 ;
2860         }
2861 out:
2862         return r;
2863 }
2864
2865 static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
2866                                   unsigned long address,
2867                                   int *type)
2868 {
2869         struct kvm *kvm = vma->vm_file->private_data;
2870         unsigned long pgoff;
2871         struct page *page;
2872
2873         *type = VM_FAULT_MINOR;
2874         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2875         page = gfn_to_page(kvm, pgoff);
2876         if (!page)
2877                 return NOPAGE_SIGBUS;
2878         get_page(page);
2879         return page;
2880 }
2881
2882 static struct vm_operations_struct kvm_vm_vm_ops = {
2883         .nopage = kvm_vm_nopage,
2884 };
2885
2886 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2887 {
2888         vma->vm_ops = &kvm_vm_vm_ops;
2889         return 0;
2890 }
2891
2892 static struct file_operations kvm_vm_fops = {
2893         .release        = kvm_vm_release,
2894         .unlocked_ioctl = kvm_vm_ioctl,
2895         .compat_ioctl   = kvm_vm_ioctl,
2896         .mmap           = kvm_vm_mmap,
2897 };
2898
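/*
 * KVM_CREATE_VM: allocate a struct kvm, bind it to a kvmfs inode and file,
 * and return a file descriptor for it to userspace.
 */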
2899 static int kvm_dev_ioctl_create_vm(void)
2900 {
2901         int fd, r;
2902         struct inode *inode;
2903         struct file *file;
2904         struct kvm *kvm;
2905
2906         inode = kvmfs_inode(&kvm_vm_fops);
2907         if (IS_ERR(inode)) {
2908                 r = PTR_ERR(inode);
2909                 goto out1;
2910         }
2911
2912         kvm = kvm_create_vm();
2913         if (IS_ERR(kvm)) {
2914                 r = PTR_ERR(kvm);
2915                 goto out2;
2916         }
2917
2918         file = kvmfs_file(inode, kvm);
2919         if (IS_ERR(file)) {
2920                 r = PTR_ERR(file);
2921                 goto out3;
2922         }
2923         kvm->filp = file;
2924
2925         r = get_unused_fd();
2926         if (r < 0)
2927                 goto out4;
2928         fd = r;
2929         fd_install(fd, file);
2930
2931         return fd;
2932
2933 out4:
2934         fput(file);
2935 out3:
2936         kvm_destroy_vm(kvm);
2937 out2:
2938         iput(inode);
2939 out1:
2940         return r;
2941 }
2942
2943 static long kvm_dev_ioctl(struct file *filp,
2944                           unsigned int ioctl, unsigned long arg)
2945 {
2946         void __user *argp = (void __user *)arg;
2947         long r = -EINVAL;
2948
2949         switch (ioctl) {
2950         case KVM_GET_API_VERSION:
2951                 r = -EINVAL;
2952                 if (arg)
2953                         goto out;
2954                 r = KVM_API_VERSION;
2955                 break;
2956         case KVM_CREATE_VM:
2957                 r = -EINVAL;
2958                 if (arg)
2959                         goto out;
2960                 r = kvm_dev_ioctl_create_vm();
2961                 break;
2962         case KVM_GET_MSR_INDEX_LIST: {
2963                 struct kvm_msr_list __user *user_msr_list = argp;
2964                 struct kvm_msr_list msr_list;
2965                 unsigned n;
2966
2967                 r = -EFAULT;
2968                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2969                         goto out;
2970                 n = msr_list.nmsrs;
2971                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2972                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2973                         goto out;
2974                 r = -E2BIG;
2975                 if (n < num_msrs_to_save)
2976                         goto out;
2977                 r = -EFAULT;
2978                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2979                                  num_msrs_to_save * sizeof(u32)))
2980                         goto out;
2981                 if (copy_to_user(user_msr_list->indices
2982                                  + num_msrs_to_save * sizeof(u32),
2983                                  &emulated_msrs,
2984                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2985                         goto out;
2986                 r = 0;
2987                 break;
2988         }
2989         case KVM_CHECK_EXTENSION:
2990                 /*
2991                  * No extensions defined at present.
2992                  */
2993                 r = 0;
2994                 break;
2995         case KVM_GET_VCPU_MMAP_SIZE:
2996                 r = -EINVAL;
2997                 if (arg)
2998                         goto out;
2999                 r = 2 * PAGE_SIZE;
3000                 break;
3001         default:
3002                 ;
3003         }
3004 out:
3005         return r;
3006 }
3007
3008 static struct file_operations kvm_chardev_ops = {
3009         .open           = kvm_dev_open,
3010         .release        = kvm_dev_release,
3011         .unlocked_ioctl = kvm_dev_ioctl,
3012         .compat_ioctl   = kvm_dev_ioctl,
3013 };
3014
3015 static struct miscdevice kvm_dev = {
3016         KVM_MINOR,
3017         "kvm",
3018         &kvm_chardev_ops,
3019 };
3020
3021 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
3022                        void *v)
3023 {
3024         if (val == SYS_RESTART) {
3025                 /*
3026                  * Some (well, at least mine) BIOSes hang on reboot if
3027                  * in vmx root mode.
3028                  */
3029                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
3030                 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3031         }
3032         return NOTIFY_OK;
3033 }
3034
3035 static struct notifier_block kvm_reboot_notifier = {
3036         .notifier_call = kvm_reboot,
3037         .priority = 0,
3038 };
3039
3040 /*
3041  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
3042  * cached on it.
3043  */
3044 static void decache_vcpus_on_cpu(int cpu)
3045 {
3046         struct kvm *vm;
3047         struct kvm_vcpu *vcpu;
3048         int i;
3049
3050         spin_lock(&kvm_lock);
3051         list_for_each_entry(vm, &vm_list, vm_list)
3052                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3053                         vcpu = &vm->vcpus[i];
3054                         /*
3055                          * If the vcpu is locked, then it is running on some
3056                          * other cpu and therefore it is not cached on the
3057                          * cpu in question.
3058                          *
3059                          * If it's not locked, check the last cpu it executed
3060                          * on.
3061                          */
3062                         if (mutex_trylock(&vcpu->mutex)) {
3063                                 if (vcpu->cpu == cpu) {
3064                                         kvm_arch_ops->vcpu_decache(vcpu);
3065                                         vcpu->cpu = -1;
3066                                 }
3067                                 mutex_unlock(&vcpu->mutex);
3068                         }
3069                 }
3070         spin_unlock(&kvm_lock);
3071 }
3072
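/*
 * CPU hotplug callback: disable hardware virtualization (after evicting any
 * vcpu state cached there) on a cpu going down, and re-enable it on a cpu
 * coming back online.
 */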
3073 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
3074                            void *v)
3075 {
3076         int cpu = (long)v;
3077
3078         switch (val) {
3079         case CPU_DOWN_PREPARE:
3080         case CPU_DOWN_PREPARE_FROZEN:
3081         case CPU_UP_CANCELED:
3082         case CPU_UP_CANCELED_FROZEN:
3083                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3084                        cpu);
3085                 decache_vcpus_on_cpu(cpu);
3086                 smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
3087                                          NULL, 0, 1);
3088                 break;
3089         case CPU_ONLINE:
3090         case CPU_ONLINE_FROZEN:
3091                 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
3092                        cpu);
3093                 smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
3094                                          NULL, 0, 1);
3095                 break;
3096         }
3097         return NOTIFY_OK;
3098 }
3099
3100 void kvm_io_bus_init(struct kvm_io_bus *bus)
3101 {
3102         memset(bus, 0, sizeof(*bus));
3103 }
3104
3105 void kvm_io_bus_destroy(struct kvm_io_bus *bus)
3106 {
3107         int i;
3108
3109         for (i = 0; i < bus->dev_count; i++) {
3110                 struct kvm_io_device *pos = bus->devs[i];
3111
3112                 kvm_iodevice_destructor(pos);
3113         }
3114 }
3115
3116 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
3117 {
3118         int i;
3119
3120         for (i = 0; i < bus->dev_count; i++) {
3121                 struct kvm_io_device *pos = bus->devs[i];
3122
3123                 if (pos->in_range(pos, addr))
3124                         return pos;
3125         }
3126
3127         return NULL;
3128 }
3129
3130 void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3131 {
3132         BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
3133
3134         bus->devs[bus->dev_count++] = dev;
3135 }
3136
3137 static struct notifier_block kvm_cpu_notifier = {
3138         .notifier_call = kvm_cpu_hotplug,
3139         .priority = 20, /* must be > scheduler priority */
3140 };
3141
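/*
 * debugfs accessor: sum a single per-vcpu counter, identified by its offset
 * within struct kvm_vcpu, over all vcpus of all VMs.
 */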
3142 static u64 stat_get(void *_offset)
3143 {
3144         unsigned offset = (long)_offset;
3145         u64 total = 0;
3146         struct kvm *kvm;
3147         struct kvm_vcpu *vcpu;
3148         int i;
3149
3150         spin_lock(&kvm_lock);
3151         list_for_each_entry(kvm, &vm_list, vm_list)
3152                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3153                         vcpu = &kvm->vcpus[i];
3154                         total += *(u32 *)((void *)vcpu + offset);
3155                 }
3156         spin_unlock(&kvm_lock);
3157         return total;
3158 }
3159
3160 static void stat_set(void *offset, u64 val)
3161 {
3162 }
3163
3164 DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, stat_set, "%llu\n");
3165
3166 static __init void kvm_init_debug(void)
3167 {
3168         struct kvm_stats_debugfs_item *p;
3169
3170         debugfs_dir = debugfs_create_dir("kvm", NULL);
3171         for (p = debugfs_entries; p->name; ++p)
3172                 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
3173                                                 (void *)(long)p->offset,
3174                                                 &stat_fops);
3175 }
3176
3177 static void kvm_exit_debug(void)
3178 {
3179         struct kvm_stats_debugfs_item *p;
3180
3181         for (p = debugfs_entries; p->name; ++p)
3182                 debugfs_remove(p->dentry);
3183         debugfs_remove(debugfs_dir);
3184 }
3185
3186 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
3187 {
3188         decache_vcpus_on_cpu(raw_smp_processor_id());
3189         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3190         return 0;
3191 }
3192
3193 static int kvm_resume(struct sys_device *dev)
3194 {
3195         on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
3196         return 0;
3197 }
3198
3199 static struct sysdev_class kvm_sysdev_class = {
3200         set_kset_name("kvm"),
3201         .suspend = kvm_suspend,
3202         .resume = kvm_resume,
3203 };
3204
3205 static struct sys_device kvm_sysdev = {
3206         .id = 0,
3207         .cls = &kvm_sysdev_class,
3208 };
3209
3210 hpa_t bad_page_address;
3211
3212 static int kvmfs_get_sb(struct file_system_type *fs_type, int flags,
3213                         const char *dev_name, void *data, struct vfsmount *mnt)
3214 {
3215         return get_sb_pseudo(fs_type, "kvm:", NULL, KVMFS_SUPER_MAGIC, mnt);
3216 }
3217
3218 static struct file_system_type kvm_fs_type = {
3219         .name           = "kvmfs",
3220         .get_sb         = kvmfs_get_sb,
3221         .kill_sb        = kill_anon_super,
3222 };
3223
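/*
 * Called by the vmx or svm module to register its kvm_arch_ops: sets up the
 * hardware, enables virtualization on every cpu, registers the hotplug,
 * reboot and sysdev hooks, and finally exposes the /dev/kvm misc device.
 */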
3224 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
3225 {
3226         int r;
3227
3228         if (kvm_arch_ops) {
3229                 printk(KERN_ERR "kvm: already loaded the other module\n");
3230                 return -EEXIST;
3231         }
3232
3233         if (!ops->cpu_has_kvm_support()) {
3234                 printk(KERN_ERR "kvm: no hardware support\n");
3235                 return -EOPNOTSUPP;
3236         }
3237         if (ops->disabled_by_bios()) {
3238                 printk(KERN_ERR "kvm: disabled by bios\n");
3239                 return -EOPNOTSUPP;
3240         }
3241
3242         kvm_arch_ops = ops;
3243
3244         r = kvm_arch_ops->hardware_setup();
3245         if (r < 0)
3246                 goto out;
3247
3248         on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
3249         r = register_cpu_notifier(&kvm_cpu_notifier);
3250         if (r)
3251                 goto out_free_1;
3252         register_reboot_notifier(&kvm_reboot_notifier);
3253
3254         r = sysdev_class_register(&kvm_sysdev_class);
3255         if (r)
3256                 goto out_free_2;
3257
3258         r = sysdev_register(&kvm_sysdev);
3259         if (r)
3260                 goto out_free_3;
3261
3262         kvm_chardev_ops.owner = module;
3263
3264         r = misc_register(&kvm_dev);
3265         if (r) {
3266                 printk(KERN_ERR "kvm: misc device register failed\n");
3267                 goto out_free;
3268         }
3269
3270         return r;
3271
3272 out_free:
3273         sysdev_unregister(&kvm_sysdev);
3274 out_free_3:
3275         sysdev_class_unregister(&kvm_sysdev_class);
3276 out_free_2:
3277         unregister_reboot_notifier(&kvm_reboot_notifier);
3278         unregister_cpu_notifier(&kvm_cpu_notifier);
3279 out_free_1:
3280         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3281         kvm_arch_ops->hardware_unsetup();
3282 out:
3283         kvm_arch_ops = NULL;
3284         return r;
3285 }
3286
3287 void kvm_exit_arch(void)
3288 {
3289         misc_deregister(&kvm_dev);
3290         sysdev_unregister(&kvm_sysdev);
3291         sysdev_class_unregister(&kvm_sysdev_class);
3292         unregister_reboot_notifier(&kvm_reboot_notifier);
3293         unregister_cpu_notifier(&kvm_cpu_notifier);
3294         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
3295         kvm_arch_ops->hardware_unsetup();
3296         kvm_arch_ops = NULL;
3297 }
3298
3299 static __init int kvm_init(void)
3300 {
3301         static struct page *bad_page;
3302         int r;
3303
3304         r = kvm_mmu_module_init();
3305         if (r)
3306                 goto out4;
3307
3308         r = register_filesystem(&kvm_fs_type);
3309         if (r)
3310                 goto out3;
3311
3312         kvmfs_mnt = kern_mount(&kvm_fs_type);
3313         r = PTR_ERR(kvmfs_mnt);
3314         if (IS_ERR(kvmfs_mnt))
3315                 goto out2;
3316         kvm_init_debug();
3317
3318         kvm_init_msr_list();
3319
3320         if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
3321                 r = -ENOMEM;
3322                 goto out;
3323         }
3324
3325         bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3326         memset(__va(bad_page_address), 0, PAGE_SIZE);
3327
3328         return 0;
3329
3330 out:
3331         kvm_exit_debug();
3332         mntput(kvmfs_mnt);
3333 out2:
3334         unregister_filesystem(&kvm_fs_type);
3335 out3:
3336         kvm_mmu_module_exit();
3337 out4:
3338         return r;
3339 }
3340
3341 static __exit void kvm_exit(void)
3342 {
3343         kvm_exit_debug();
3344         __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3345         mntput(kvmfs_mnt);
3346         unregister_filesystem(&kvm_fs_type);
3347         kvm_mmu_module_exit();
3348 }
3349
3350 module_init(kvm_init)
3351 module_exit(kvm_exit)
3352
3353 EXPORT_SYMBOL_GPL(kvm_init_arch);
3354 EXPORT_SYMBOL_GPL(kvm_exit_arch);