mmu.c

/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_mmio.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>

#include "trace.h"

extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];

static pgd_t *boot_hyp_pgd;
static pgd_t *hyp_pgd;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static void *init_bounce_page;
static unsigned long hyp_idmap_start;
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;

#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))

#define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))

static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
{
	/*
	 * This function also gets called when dealing with HYP page
	 * tables. As HYP doesn't have an associated struct kvm (and
	 * the HYP page tables are fairly static), we don't do
	 * anything there.
	 */
	if (kvm)
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
}

/*
 * D-Cache management functions. They take the page table entries by
 * value, as they are flushing the cache using the kernel mapping (or
 * kmap on 32bit).
 */
static void kvm_flush_dcache_pte(pte_t pte)
{
	__kvm_flush_dcache_pte(pte);
}

static void kvm_flush_dcache_pmd(pmd_t pmd)
{
	__kvm_flush_dcache_pmd(pmd);
}

static void kvm_flush_dcache_pud(pud_t pud)
{
	__kvm_flush_dcache_pud(pud);
}
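
/*
 * Pre-fill the memory object cache with page-table pages so that the
 * stage-2 mapping code, which runs under mmu_lock, never needs to
 * allocate memory itself.
 */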
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  int min, int max)
{
	void *page;

	BUG_ON(max > KVM_NR_MEM_OBJS);
	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < max) {
		page = (void *)__get_free_page(PGALLOC_GFP);
		if (!page)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = page;
	}
	return 0;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	BUG_ON(!mc || !mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}
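
/*
 * The clear_*_entry() helpers below tear down one level of a page
 * table: clear the entry, flush the TLB for the covered IPA and
 * release the now-unreferenced lower-level table page.
 */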
static void clear_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
{
	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0);
	pgd_clear(pgd);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	pud_free(NULL, pud_table);
	put_page(virt_to_page(pgd));
}

static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
{
	pmd_t *pmd_table = pmd_offset(pud, 0);
	VM_BUG_ON(pud_huge(*pud));
	pud_clear(pud);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	pmd_free(NULL, pmd_table);
	put_page(virt_to_page(pud));
}

static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
{
	pte_t *pte_table = pte_offset_kernel(pmd, 0);
	VM_BUG_ON(kvm_pmd_huge(*pmd));
	pmd_clear(pmd);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	pte_free_kernel(NULL, pte_table);
	put_page(virt_to_page(pmd));
}

/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM. However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
 * the IO subsystem will never hit in the cache.
 */
static void unmap_ptes(struct kvm *kvm, pmd_t *pmd,
		       phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t start_addr = addr;
	pte_t *pte, *start_pte;

	start_pte = pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte)) {
			pte_t old_pte = *pte;

			kvm_set_pte(pte, __pte(0));
			kvm_tlb_flush_vmid_ipa(kvm, addr);

			/* No need to invalidate the cache for device mappings */
			if ((pte_val(old_pte) & PAGE_S2_DEVICE) != PAGE_S2_DEVICE)
				kvm_flush_dcache_pte(old_pte);

			put_page(virt_to_page(pte));
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);

	if (kvm_pte_table_empty(kvm, start_pte))
		clear_pmd_entry(kvm, pmd, start_addr);
}

static void unmap_pmds(struct kvm *kvm, pud_t *pud,
		       phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next, start_addr = addr;
	pmd_t *pmd, *start_pmd;

	start_pmd = pmd = pmd_offset(pud, addr);
	do {
		next = kvm_pmd_addr_end(addr, end);
		if (!pmd_none(*pmd)) {
			if (kvm_pmd_huge(*pmd)) {
				pmd_t old_pmd = *pmd;

				pmd_clear(pmd);
				kvm_tlb_flush_vmid_ipa(kvm, addr);

				kvm_flush_dcache_pmd(old_pmd);

				put_page(virt_to_page(pmd));
			} else {
				unmap_ptes(kvm, pmd, addr, next);
			}
		}
	} while (pmd++, addr = next, addr != end);

	if (kvm_pmd_table_empty(kvm, start_pmd))
		clear_pud_entry(kvm, pud, start_addr);
}

static void unmap_puds(struct kvm *kvm, pgd_t *pgd,
		       phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next, start_addr = addr;
	pud_t *pud, *start_pud;

	start_pud = pud = pud_offset(pgd, addr);
	do {
		next = kvm_pud_addr_end(addr, end);
		if (!pud_none(*pud)) {
			if (pud_huge(*pud)) {
				pud_t old_pud = *pud;

				pud_clear(pud);
				kvm_tlb_flush_vmid_ipa(kvm, addr);

				kvm_flush_dcache_pud(old_pud);

				put_page(virt_to_page(pud));
			} else {
				unmap_pmds(kvm, pud, addr, next);
			}
		}
	} while (pud++, addr = next, addr != end);

	if (kvm_pud_table_empty(kvm, start_pud))
		clear_pgd_entry(kvm, pgd, start_addr);
}
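
/*
 * Walk the page tables under @pgdp and remove all mappings in the
 * range [start, start + size), freeing any table pages that become
 * empty along the way. Used for both stage-2 and HYP page tables
 * (kvm == NULL for the latter).
 */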
static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
			phys_addr_t start, u64 size)
{
	pgd_t *pgd;
	phys_addr_t addr = start, end = start + size;
	phys_addr_t next;

	pgd = pgdp + kvm_pgd_index(addr);
	do {
		next = kvm_pgd_addr_end(addr, end);
		if (!pgd_none(*pgd))
			unmap_puds(kvm, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
			      phys_addr_t addr, phys_addr_t end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte) &&
		    (pte_val(*pte) & PAGE_S2_DEVICE) != PAGE_S2_DEVICE)
			kvm_flush_dcache_pte(*pte);
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
			      phys_addr_t addr, phys_addr_t end)
{
	pmd_t *pmd;
	phys_addr_t next;

	pmd = pmd_offset(pud, addr);
	do {
		next = kvm_pmd_addr_end(addr, end);
		if (!pmd_none(*pmd)) {
			if (kvm_pmd_huge(*pmd))
				kvm_flush_dcache_pmd(*pmd);
			else
				stage2_flush_ptes(kvm, pmd, addr, next);
		}
	} while (pmd++, addr = next, addr != end);
}

static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
			      phys_addr_t addr, phys_addr_t end)
{
	pud_t *pud;
	phys_addr_t next;

	pud = pud_offset(pgd, addr);
	do {
		next = kvm_pud_addr_end(addr, end);
		if (!pud_none(*pud)) {
			if (pud_huge(*pud))
				kvm_flush_dcache_pud(*pud);
			else
				stage2_flush_pmds(kvm, pud, addr, next);
		}
	} while (pud++, addr = next, addr != end);
}
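
/*
 * Flush the data cache for all pages mapped in the stage-2 tables
 * covering this memslot; helper for stage2_flush_vm() below.
 */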
static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
	phys_addr_t next;
	pgd_t *pgd;

	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
	do {
		next = kvm_pgd_addr_end(addr, end);
		stage2_flush_puds(kvm, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
 */
void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_flush_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * free_boot_hyp_pgd - free HYP boot page tables
 *
 * Free the HYP boot page tables. The bounce page is also freed.
 */
void free_boot_hyp_pgd(void)
{
	mutex_lock(&kvm_hyp_pgd_mutex);

	if (boot_hyp_pgd) {
		unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
		unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
		boot_hyp_pgd = NULL;
	}

	if (hyp_pgd)
		unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);

	free_page((unsigned long)init_bounce_page);
	init_bounce_page = NULL;

	mutex_unlock(&kvm_hyp_pgd_mutex);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 *
 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
 * therefore contains either mappings in the kernel memory area (above
 * PAGE_OFFSET), or device mappings in the vmalloc range (from
 * VMALLOC_START to VMALLOC_END).
 *
 * boot_hyp_pgd should only map two pages for the init code.
 */
void free_hyp_pgds(void)
{
	unsigned long addr;

	free_boot_hyp_pgd();

	mutex_lock(&kvm_hyp_pgd_mutex);

	if (hyp_pgd) {
		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);

		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
		hyp_pgd = NULL;
	}

	mutex_unlock(&kvm_hyp_pgd_mutex);
}
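
/*
 * The create_hyp_*_mappings() helpers below install HYP mappings one
 * table level at a time, allocating intermediate tables as needed and
 * cleaning the updated entries to the point of coherency so they are
 * visible to page table walks performed with caches off.
 */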
static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
				    unsigned long end, unsigned long pfn,
				    pgprot_t prot)
{
	pte_t *pte;
	unsigned long addr;

	addr = start;
	do {
		pte = pte_offset_kernel(pmd, addr);
		kvm_set_pte(pte, pfn_pte(pfn, prot));
		get_page(virt_to_page(pte));
		kvm_flush_dcache_to_poc(pte, sizeof(*pte));
		pfn++;
	} while (addr += PAGE_SIZE, addr != end);
}

static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
				   unsigned long end, unsigned long pfn,
				   pgprot_t prot)
{
	pmd_t *pmd;
	pte_t *pte;
	unsigned long addr, next;

	addr = start;
	do {
		pmd = pmd_offset(pud, addr);

		BUG_ON(pmd_sect(*pmd));

		if (pmd_none(*pmd)) {
			pte = pte_alloc_one_kernel(NULL, addr);
			if (!pte) {
				kvm_err("Cannot allocate Hyp pte\n");
				return -ENOMEM;
			}
			pmd_populate_kernel(NULL, pmd, pte);
			get_page(virt_to_page(pmd));
			kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
		}

		next = pmd_addr_end(addr, end);

		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}

static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
				   unsigned long end, unsigned long pfn,
				   pgprot_t prot)
{
	pud_t *pud;
	pmd_t *pmd;
	unsigned long addr, next;
	int ret;

	addr = start;
	do {
		pud = pud_offset(pgd, addr);

		if (pud_none_or_clear_bad(pud)) {
			pmd = pmd_alloc_one(NULL, addr);
			if (!pmd) {
				kvm_err("Cannot allocate Hyp pmd\n");
				return -ENOMEM;
			}
			pud_populate(NULL, pud, pmd);
			get_page(virt_to_page(pud));
			kvm_flush_dcache_to_poc(pud, sizeof(*pud));
		}

		next = pud_addr_end(addr, end);
		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
		if (ret)
			return ret;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}
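
/*
 * Common worker for the HYP mapping functions: map the page-aligned
 * range [start, end) in @pgdp to consecutive physical pages starting
 * at @pfn with protection @prot, under kvm_hyp_pgd_mutex.
 */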
static int __create_hyp_mappings(pgd_t *pgdp,
				 unsigned long start, unsigned long end,
				 unsigned long pfn, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	unsigned long addr, next;
	int err = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);
	addr = start & PAGE_MASK;
	end = PAGE_ALIGN(end);
	do {
		pgd = pgdp + pgd_index(addr);

		if (pgd_none(*pgd)) {
			pud = pud_alloc_one(NULL, addr);
			if (!pud) {
				kvm_err("Cannot allocate Hyp pud\n");
				err = -ENOMEM;
				goto out;
			}
			pgd_populate(NULL, pgd, pud);
			get_page(virt_to_page(pgd));
			kvm_flush_dcache_to_poc(pgd, sizeof(*pgd));
		}

		next = pgd_addr_end(addr, end);
		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
		if (err)
			goto out;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);
out:
	mutex_unlock(&kvm_hyp_pgd_mutex);
	return err;
}
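
/*
 * Translate a kernel virtual address to a physical address: linear-map
 * addresses go through __pa(), vmalloc addresses through the vmalloc
 * page tables.
 */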
static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from: The virtual kernel start address of the range
 * @to: The virtual kernel end address of the range (exclusive)
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = KERN_TO_HYP((unsigned long)from);
	unsigned long end = KERN_TO_HYP((unsigned long)to);

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(hyp_pgd, virt_addr,
					    virt_addr + PAGE_SIZE,
					    __phys_to_pfn(phys_addr),
					    PAGE_HYP);
		if (err)
			return err;
	}

	return 0;
}

/**
 * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
 * @from: The kernel start VA of the range
 * @to: The kernel end VA of the range (exclusive)
 * @phys_addr: The physical start address which gets mapped
 *
 * The resulting HYP VA is the same as the kernel VA, modulo
 * HYP_PAGE_OFFSET.
 */
int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
{
	unsigned long start = KERN_TO_HYP((unsigned long)from);
	unsigned long end = KERN_TO_HYP((unsigned long)to);

	/* Check for a valid kernel IO mapping */
	if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
		return -EINVAL;

	return __create_hyp_mappings(hyp_pgd, start, end,
				     __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
}

/* Free the HW pgd, one page at a time */
static void kvm_free_hwpgd(void *hwpgd)
{
	free_pages_exact(hwpgd, kvm_get_hwpgd_size());
}

/* Allocate the HW PGD, making sure that each page gets its own refcount */
static void *kvm_alloc_hwpgd(void)
{
	unsigned int size = kvm_get_hwpgd_size();

	return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
}

/**
 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 * @kvm: The KVM struct pointer for the VM.
 *
 * Allocates only the 1st level table, of the size defined by S2_PGD_ORDER
 * (it can support either full 40-bit input addresses or be limited to
 * 32-bit input addresses). Clears the allocated pages.
 *
 * Note we don't need locking here as this is only called when the VM is
 * created, which can only be done once.
 */
int kvm_alloc_stage2_pgd(struct kvm *kvm)
{
	pgd_t *pgd;
	void *hwpgd;

	if (kvm->arch.pgd != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	hwpgd = kvm_alloc_hwpgd();
	if (!hwpgd)
		return -ENOMEM;

	/* When the kernel uses more levels of page tables than the
	 * guest, we allocate a fake PGD and pre-populate it to point
	 * to the next-level page table, which will be the real
	 * initial page table pointed to by the VTTBR.
	 *
	 * When KVM_PREALLOC_LEVEL==2, we allocate a single page for
	 * the PMD and the kernel will use folded pud.
	 * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD
	 * pages.
	 */
	if (KVM_PREALLOC_LEVEL > 0) {
		int i;

		/*
		 * Allocate fake pgd for the page table manipulation macros to
		 * work. This is not used by the hardware and we have no
		 * alignment requirement for this allocation.
		 */
		pgd = (pgd_t *)kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
				       GFP_KERNEL | __GFP_ZERO);
		if (!pgd) {
			kvm_free_hwpgd(hwpgd);
			return -ENOMEM;
		}

		/* Plug the HW PGD into the fake one. */
		for (i = 0; i < PTRS_PER_S2_PGD; i++) {
			if (KVM_PREALLOC_LEVEL == 1)
				pgd_populate(NULL, pgd + i,
					     (pud_t *)hwpgd + i * PTRS_PER_PUD);
			else if (KVM_PREALLOC_LEVEL == 2)
				pud_populate(NULL, pud_offset(pgd, 0) + i,
					     (pmd_t *)hwpgd + i * PTRS_PER_PMD);
		}
	} else {
		/*
		 * Allocate actual first-level Stage-2 page table used by the
		 * hardware for Stage-2 page table walks.
		 */
		pgd = (pgd_t *)hwpgd;
	}

	kvm_clean_pgd(pgd);
	kvm->arch.pgd = pgd;
	return 0;
}

/**
 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @kvm: The VM pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size: The size of the area to unmap
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
{
	unmap_range(kvm, kvm->arch.pgd, start, size);
}
static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_unmap_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * kvm_free_stage2_pgd - free all stage-2 tables
 * @kvm: The KVM struct pointer for the VM.
 *
 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
 * underlying level-2 and level-3 tables before freeing the actual level-1 table
 * and setting the struct pointer to NULL.
 *
 * Note we don't need locking here as this is only called when the VM is
 * destroyed, which can only be done once.
 */
void kvm_free_stage2_pgd(struct kvm *kvm)
{
	if (kvm->arch.pgd == NULL)
		return;

	unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
	kvm_free_hwpgd(kvm_get_hwpgd(kvm));
	if (KVM_PREALLOC_LEVEL > 0)
		kfree(kvm->arch.pgd);

	kvm->arch.pgd = NULL;
}
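
/*
 * stage2_get_pud()/stage2_get_pmd() walk the stage-2 tables for @addr,
 * allocating missing intermediate tables from @cache (which must have
 * been topped up beforehand); they return NULL when a table is missing
 * and no cache was supplied.
 */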
static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
	if (WARN_ON(pgd_none(*pgd))) {
		if (!cache)
			return NULL;
		pud = mmu_memory_cache_alloc(cache);
		pgd_populate(NULL, pgd, pud);
		get_page(virt_to_page(pgd));
	}

	return pud_offset(pgd, addr);
}

static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = stage2_get_pud(kvm, cache, addr);
	if (pud_none(*pud)) {
		if (!cache)
			return NULL;
		pmd = mmu_memory_cache_alloc(cache);
		pud_populate(NULL, pud, pmd);
		get_page(virt_to_page(pud));
	}

	return pmd_offset(pud, addr);
}

static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
{
	pmd_t *pmd, old_pmd;

	pmd = stage2_get_pmd(kvm, cache, addr);
	VM_BUG_ON(!pmd);

	/*
	 * Mapping in huge pages should only happen through a fault. If a
	 * page is merged into a transparent huge page, the individual
	 * subpages of that huge page should be unmapped through MMU
	 * notifiers before we get here.
	 *
	 * Merging of CompoundPages is not supported; they should instead
	 * be split first, unmapped, merged, and mapped back in on demand.
	 */
	VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));

	old_pmd = *pmd;
	kvm_set_pmd(pmd, *new_pmd);
	if (pmd_present(old_pmd))
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	else
		get_page(virt_to_page(pmd));
	return 0;
}
static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
{
	pmd_t *pmd;
	pte_t *pte, old_pte;

	/* Create stage-2 page table mapping - Levels 0 and 1 */
	pmd = stage2_get_pmd(kvm, cache, addr);
	if (!pmd) {
		/*
		 * Ignore calls from kvm_set_spte_hva for unallocated
		 * address ranges.
		 */
		return 0;
	}

	/* Create stage-2 page mappings - Level 2 */
	if (pmd_none(*pmd)) {
		if (!cache)
			return 0; /* ignore calls from kvm_set_spte_hva */
		pte = mmu_memory_cache_alloc(cache);
		kvm_clean_pte(pte);
		pmd_populate_kernel(NULL, pmd, pte);
		get_page(virt_to_page(pmd));
	}

	pte = pte_offset_kernel(pmd, addr);

	if (iomap && pte_present(*pte))
		return -EFAULT;

	/* Create 2nd stage page table mapping - Level 3 */
	old_pte = *pte;
	kvm_set_pte(pte, *new_pte);
	if (pte_present(old_pte))
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	else
		get_page(virt_to_page(pte));

	return 0;
}

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm: The KVM pointer
 * @guest_ipa: The IPA at which to insert the mapping
 * @pa: The physical address of the device
 * @size: The size of the mapping
 * @writable: Whether or not to create a writable mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size, bool writable)
{
	phys_addr_t addr, end;
	int ret = 0;
	unsigned long pfn;
	struct kvm_mmu_memory_cache cache = { 0, };

	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
	pfn = __phys_to_pfn(pa);

	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
		pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);

		if (writable)
			kvm_set_s2pte_writable(&pte);

		ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
					     KVM_NR_MEM_OBJS);
		if (ret)
			goto out;
		spin_lock(&kvm->mmu_lock);
		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
		spin_unlock(&kvm->mmu_lock);
		if (ret)
			goto out;

		pfn++;
	}

out:
	mmu_free_memory_cache(&cache);
	return ret;
}

static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
{
	pfn_t pfn = *pfnp;
	gfn_t gfn = *ipap >> PAGE_SHIFT;

	if (PageTransCompound(pfn_to_page(pfn))) {
		unsigned long mask;
		/*
		 * The address we faulted on is backed by a transparent huge
		 * page. However, because we map the compound huge page and
		 * not the individual tail page, we need to transfer the
		 * refcount to the head page. We have to be careful that the
		 * THP doesn't start to split while we are adjusting the
		 * refcounts.
		 *
		 * We are sure this doesn't happen, because mmu_notifier_retry
		 * was successful and we are holding the mmu_lock, so if this
		 * THP is trying to split, it will be blocked in the mmu
		 * notifier before touching any of the pages, specifically
		 * before being able to call __split_huge_page_refcount().
		 *
		 * We can therefore safely transfer the refcount from PG_tail
		 * to PG_head and switch the pfn from a tail page to the head
		 * page accordingly.
		 */
		mask = PTRS_PER_PMD - 1;
		VM_BUG_ON((gfn & mask) != (pfn & mask));
		if (pfn & mask) {
			*ipap &= PMD_MASK;
			kvm_release_pfn_clean(pfn);
			pfn &= ~mask;
			kvm_get_pfn(pfn);
			*pfnp = pfn;
		}

		return true;
	}

	return false;
}

static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
{
	if (kvm_vcpu_trap_is_iabt(vcpu))
		return false;

	return kvm_vcpu_dabt_iswrite(vcpu);
}

static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_valid(pfn);
}

static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
				      unsigned long size, bool uncached)
{
	__coherent_cache_guest_page(vcpu, pfn, size, uncached);
}
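
/*
 * user_mem_abort() resolves a stage-2 fault on memory backed by a
 * memslot: it pins the corresponding user page, performs the required
 * cache maintenance and installs a stage-2 mapping (a PMD block mapping
 * when the page is part of a huge page, a PTE otherwise).
 */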
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  unsigned long fault_status)
{
	int ret;
	bool write_fault, writable, hugetlb = false, force_pte = false;
	unsigned long mmu_seq;
	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
	struct vm_area_struct *vma;
	pfn_t pfn;
	pgprot_t mem_type = PAGE_S2;
	bool fault_ipa_uncached;

	write_fault = kvm_is_write_fault(vcpu);
	if (fault_status == FSC_PERM && !write_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/* Let's check if we will get back a huge page backed by hugetlbfs */
	down_read(&current->mm->mmap_sem);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		up_read(&current->mm->mmap_sem);
		return -EFAULT;
	}

	if (is_vm_hugetlb_page(vma)) {
		hugetlb = true;
		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
	} else {
		/*
		 * Pages belonging to memslots that don't have the same
		 * alignment for userspace and IPA cannot be mapped using
		 * block descriptors even if the pages belong to a THP for
		 * the process, because the stage-2 block descriptor will
		 * cover more than a single THP and we lose atomicity for
		 * unmapping, updates, and splits of the THP or other pages
		 * in the stage-2 block range.
		 */
		if ((memslot->userspace_addr & ~PMD_MASK) !=
		    ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
			force_pte = true;
	}
	up_read(&current->mm->mmap_sem);

	/* We need minimum second+third level pages */
	ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
				     KVM_NR_MEM_OBJS);
	if (ret)
		return ret;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq happens before we call
	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
	 * the page we just got a reference to getting unmapped before we have
	 * a chance to grab the mmu_lock, which ensures that if the page gets
	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
	 */
	smp_rmb();

	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
	if (is_error_pfn(pfn))
		return -EFAULT;

	if (kvm_is_device_pfn(pfn))
		mem_type = PAGE_S2_DEVICE;

	spin_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;
	if (!hugetlb && !force_pte)
		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);

	fault_ipa_uncached = memslot->flags & KVM_MEMSLOT_INCOHERENT;

	if (hugetlb) {
		pmd_t new_pmd = pfn_pmd(pfn, mem_type);
		new_pmd = pmd_mkhuge(new_pmd);
		if (writable) {
			kvm_set_s2pmd_writable(&new_pmd);
			kvm_set_pfn_dirty(pfn);
		}
		coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
	} else {
		pte_t new_pte = pfn_pte(pfn, mem_type);
		if (writable) {
			kvm_set_s2pte_writable(&new_pte);
			kvm_set_pfn_dirty(pfn);
		}
		coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached);
		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
	}

out_unlock:
	spin_unlock(&kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return ret;
}
/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu: the VCPU pointer
 * @run: the kvm_run structure
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean either that
 * the guest simply needs more memory and we must allocate an appropriate
 * page, or that the guest tried to access I/O memory, which is emulated by
 * user space. The distinction is based on the IPA causing the fault and
 * whether this memory region has been registered as standard RAM by user
 * space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
	unsigned long fault_status;
	phys_addr_t fault_ipa;
	struct kvm_memory_slot *memslot;
	unsigned long hva;
	bool is_iabt, write_fault, writable;
	gfn_t gfn;
	int ret, idx;

	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/* Check that the stage-2 fault is a translation or permission fault */
	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
	if (fault_status != FSC_FAULT && fault_status != FSC_PERM) {
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
			(unsigned long)kvm_vcpu_get_hsr(vcpu));
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	gfn = fault_ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		if (is_iabt) {
			/* Prefetch Abort on I/O address */
			kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			ret = 1;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
		ret = io_mem_abort(vcpu, run, fault_ipa);
		goto out_unlock;
	}

	/* Userspace should not be able to register out-of-bounds IPAs */
	VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE);

	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
	if (ret == 0)
		ret = 1;
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}
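
/*
 * Apply @handler to every guest physical page that intersects the host
 * virtual address range [start, end), by walking all memslots that
 * overlap that range.
 */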
static void handle_hva_to_gpa(struct kvm *kvm,
			      unsigned long start,
			      unsigned long end,
			      void (*handler)(struct kvm *kvm,
					      gpa_t gpa, void *data),
			      void *data)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;

	slots = kvm_memslots(kvm);

	/* we only care about the pages that the guest sees */
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gfn, gfn_end;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;

		/*
		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
		 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
		 */
		gfn = hva_to_gfn_memslot(hva_start, memslot);
		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

		for (; gfn < gfn_end; ++gfn) {
			gpa_t gpa = gfn << PAGE_SHIFT;
			handler(kvm, gpa, data);
		}
	}
}

static void kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
	unmap_stage2_range(kvm, gpa, PAGE_SIZE);
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
	unsigned long end = hva + PAGE_SIZE;

	if (!kvm->arch.pgd)
		return 0;

	trace_kvm_unmap_hva(hva);
	handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
	return 0;
}

int kvm_unmap_hva_range(struct kvm *kvm,
			unsigned long start, unsigned long end)
{
	if (!kvm->arch.pgd)
		return 0;

	trace_kvm_unmap_hva_range(start, end);
	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
	return 0;
}

static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
	pte_t *pte = (pte_t *)data;

	stage2_set_pte(kvm, NULL, gpa, pte, false);
}

void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long end = hva + PAGE_SIZE;
	pte_t stage2_pte;

	if (!kvm->arch.pgd)
		return;

	trace_kvm_set_spte_hva(hva);
	stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2);
	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
}

void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	return virt_to_phys(hyp_pgd);
}

phys_addr_t kvm_mmu_get_boot_httbr(void)
{
	return virt_to_phys(boot_hyp_pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}
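
/*
 * kvm_mmu_init() sets up the HYP page tables at boot: it identity-maps
 * the HYP init code (via a bounce page if that code crosses a page
 * boundary) and maps the trampoline page into both the boot and the
 * runtime HYP page tables.
 */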
int kvm_mmu_init(void)
{
	int err;

	hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
	hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
	hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);

	if ((hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK) {
		/*
		 * Our init code is crossing a page boundary. Allocate
		 * a bounce page, copy the code over and use that.
		 */
		size_t len = __hyp_idmap_text_end - __hyp_idmap_text_start;
		phys_addr_t phys_base;

		init_bounce_page = (void *)__get_free_page(GFP_KERNEL);
		if (!init_bounce_page) {
			kvm_err("Couldn't allocate HYP init bounce page\n");
			err = -ENOMEM;
			goto out;
		}

		memcpy(init_bounce_page, __hyp_idmap_text_start, len);
		/*
		 * Warning: the code we just copied to the bounce page
		 * must be flushed to the point of coherency.
		 * Otherwise, the data may be sitting in L2, and HYP
		 * mode won't be able to observe it as it runs with
		 * caches off at that point.
		 */
		kvm_flush_dcache_to_poc(init_bounce_page, len);

		phys_base = kvm_virt_to_phys(init_bounce_page);
		hyp_idmap_vector += phys_base - hyp_idmap_start;
		hyp_idmap_start = phys_base;
		hyp_idmap_end = phys_base + len;

		kvm_info("Using HYP init bounce page @%lx\n",
			 (unsigned long)phys_base);
	}

	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
	boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);

	if (!hyp_pgd || !boot_hyp_pgd) {
		kvm_err("Hyp mode PGD not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	/* Create the idmap in the boot page tables */
	err = __create_hyp_mappings(boot_hyp_pgd,
				    hyp_idmap_start, hyp_idmap_end,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP);
	if (err) {
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);
		goto out;
	}

	/* Map the very same page at the trampoline VA */
	err = __create_hyp_mappings(boot_hyp_pgd,
				    TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP);
	if (err) {
		kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
			TRAMPOLINE_VA);
		goto out;
	}

	/* Map the same page again into the runtime page tables */
	err = __create_hyp_mappings(hyp_pgd,
				    TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP);
	if (err) {
		kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
			TRAMPOLINE_VA);
		goto out;
	}

	return 0;
out:
	free_hyp_pgds();
	return err;
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_userspace_memory_region *mem,
				   const struct kvm_memory_slot *old,
				   enum kvm_mr_change change)
{
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *memslot,
				   struct kvm_userspace_memory_region *mem,
				   enum kvm_mr_change change)
{
	hva_t hva = mem->userspace_addr;
	hva_t reg_end = hva + mem->memory_size;
	bool writable = !(mem->flags & KVM_MEM_READONLY);
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the
	 * IPA space addressable by the KVM guest.
	 */
	if (memslot->base_gfn + memslot->npages >=
	    (KVM_PHYS_SIZE >> PAGE_SHIFT))
		return -EFAULT;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we can map
	 * any of them right now.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Mapping a read-only VMA is only allowed if the
		 * memory region is configured as read-only.
		 */
		if (writable && !(vma->vm_flags & VM_WRITE)) {
			ret = -EPERM;
			break;
		}

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (vma->vm_flags & VM_PFNMAP) {
			gpa_t gpa = mem->guest_phys_addr +
				    (vm_start - mem->userspace_addr);
			phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) +
					 vm_start - vma->vm_start;

			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
						    vm_end - vm_start,
						    writable);
			if (ret)
				break;
		}
		hva = vm_end;
	} while (hva < reg_end);

	spin_lock(&kvm->mmu_lock);
	if (ret)
		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
	else
		stage2_flush_memslot(kvm, memslot);
	spin_unlock(&kvm->mmu_lock);
	return ret;
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
			   struct kvm_memory_slot *dont)
{
}

int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
			    unsigned long npages)
{
	/*
	 * Readonly memslots are not incoherent with the caches by definition,
	 * but in practice, they are used mostly to emulate ROMs or NOR flashes
	 * that the guest may consider devices and hence map as uncached.
	 * To prevent incoherency issues in these cases, tag all readonly
	 * regions as incoherent.
	 */
	if (slot->flags & KVM_MEM_READONLY)
		slot->flags |= KVM_MEMSLOT_INCOHERENT;
	return 0;
}

void kvm_arch_memslots_updated(struct kvm *kvm)
{
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	unmap_stage2_range(kvm, gpa, size);
	spin_unlock(&kvm->mmu_lock);
}