dm-bufio.c

  1. /*
  2. * Copyright (C) 2009-2011 Red Hat, Inc.
  3. *
  4. * Author: Mikulas Patocka <mpatocka@redhat.com>
  5. *
  6. * This file is released under the GPL.
  7. */
  8. #include "dm-bufio.h"
  9. #include <linux/device-mapper.h>
  10. #include <linux/dm-io.h>
  11. #include <linux/slab.h>
  12. #include <linux/vmalloc.h>
  13. #include <linux/shrinker.h>
  14. #include <linux/module.h>
  15. #define DM_MSG_PREFIX "bufio"
  16. /*
  17. * Memory management policy:
  18. * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
  19. * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
  20. * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
  21. * Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
  22. * dirty buffers.
  23. */
  24. #define DM_BUFIO_MIN_BUFFERS 8
  25. #define DM_BUFIO_MEMORY_PERCENT 2
  26. #define DM_BUFIO_VMALLOC_PERCENT 25
  27. #define DM_BUFIO_WRITEBACK_PERCENT 75
  28. /*
  29. * Check buffer ages in this interval (seconds)
  30. */
  31. #define DM_BUFIO_WORK_TIMER_SECS 10
  32. /*
  33. * Free buffers when they are older than this (seconds)
  34. */
  35. #define DM_BUFIO_DEFAULT_AGE_SECS 60
  36. /*
  37. * The number of bvec entries that are embedded directly in the buffer.
  38. * If the chunk size is larger, dm-io is used to do the I/O.
  39. */
  40. #define DM_BUFIO_INLINE_VECS 16
  41. /*
  42. * Buffer hash
  43. */
  44. #define DM_BUFIO_HASH_BITS 20
  45. #define DM_BUFIO_HASH(block) \
  46. ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
  47. ((1 << DM_BUFIO_HASH_BITS) - 1))
  48. /*
  49. * Don't try to use kmem_cache_alloc for blocks larger than this.
  50. * For explanation, see alloc_buffer_data below.
  51. */
  52. #define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT (PAGE_SIZE >> 1)
  53. #define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1))
  54. /*
  55. * dm_buffer->list_mode
  56. */
  57. #define LIST_CLEAN 0
  58. #define LIST_DIRTY 1
  59. #define LIST_SIZE 2
  60. /*
  61. * Linking of buffers:
  62. * All buffers are linked to cache_hash with their hash_list field.
  63. *
  64. * Clean buffers that are not being written (B_WRITING not set)
  65. * are linked to lru[LIST_CLEAN] with their lru_list field.
  66. *
  67. * Dirty and clean buffers that are being written are linked to
  68. * lru[LIST_DIRTY] with their lru_list field. When the write
  69. * finishes, the buffer cannot be relinked immediately (because we
  70. * are in an interrupt context and relinking requires process
  71. * context), so some clean-not-writing buffers can be held on
  72. * dirty_lru too. They are later added to lru in the process
  73. * context.
  74. */
  75. struct dm_bufio_client {
  76. struct mutex lock;
  77. struct list_head lru[LIST_SIZE];
  78. unsigned long n_buffers[LIST_SIZE];
  79. struct block_device *bdev;
  80. unsigned block_size;
  81. unsigned char sectors_per_block_bits;
  82. unsigned char pages_per_block_bits;
  83. unsigned char blocks_per_page_bits;
  84. unsigned aux_size;
  85. void (*alloc_callback)(struct dm_buffer *);
  86. void (*write_callback)(struct dm_buffer *);
  87. struct dm_io_client *dm_io;
  88. struct list_head reserved_buffers;
  89. unsigned need_reserved_buffers;
  90. unsigned minimum_buffers;
  91. struct hlist_head *cache_hash;
  92. wait_queue_head_t free_buffer_wait;
  93. int async_write_error;
  94. struct list_head client_list;
  95. struct shrinker shrinker;
  96. };
  97. /*
  98. * Buffer state bits.
  99. */
  100. #define B_READING 0
  101. #define B_WRITING 1
  102. #define B_DIRTY 2
  103. /*
  104. * Describes how the block was allocated:
  105. * kmem_cache_alloc(), __get_free_pages() or vmalloc().
  106. * See the comment at alloc_buffer_data.
  107. */
  108. enum data_mode {
  109. DATA_MODE_SLAB = 0,
  110. DATA_MODE_GET_FREE_PAGES = 1,
  111. DATA_MODE_VMALLOC = 2,
  112. DATA_MODE_LIMIT = 3
  113. };
  114. struct dm_buffer {
  115. struct hlist_node hash_list;
  116. struct list_head lru_list;
  117. sector_t block;
  118. void *data;
  119. enum data_mode data_mode;
  120. unsigned char list_mode; /* LIST_* */
  121. unsigned hold_count;
  122. int read_error;
  123. int write_error;
  124. unsigned long state;
  125. unsigned long last_accessed;
  126. struct dm_bufio_client *c;
  127. struct list_head write_list;
  128. struct bio bio;
  129. struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
  130. };
  131. /*----------------------------------------------------------------*/
  132. static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
  133. static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];
  134. static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
  135. {
  136. unsigned ret = c->blocks_per_page_bits - 1;
  137. BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));
  138. return ret;
  139. }
  140. #define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)])
  141. #define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)])
  142. #define dm_bufio_in_request() (!!current->bio_list)
  143. static void dm_bufio_lock(struct dm_bufio_client *c)
  144. {
  145. mutex_lock_nested(&c->lock, dm_bufio_in_request());
  146. }
  147. static int dm_bufio_trylock(struct dm_bufio_client *c)
  148. {
  149. return mutex_trylock(&c->lock);
  150. }
  151. static void dm_bufio_unlock(struct dm_bufio_client *c)
  152. {
  153. mutex_unlock(&c->lock);
  154. }
  155. /*
  156. * FIXME Move to sched.h?
  157. */
  158. #ifdef CONFIG_PREEMPT_VOLUNTARY
  159. # define dm_bufio_cond_resched() \
  160. do { \
  161. if (unlikely(need_resched())) \
  162. _cond_resched(); \
  163. } while (0)
  164. #else
  165. # define dm_bufio_cond_resched() do { } while (0)
  166. #endif
  167. /*----------------------------------------------------------------*/
  168. /*
  169. * Default cache size: available memory divided by the ratio.
  170. */
  171. static unsigned long dm_bufio_default_cache_size;
  172. /*
  173. * Total cache size set by the user.
  174. */
  175. static unsigned long dm_bufio_cache_size;
  176. /*
  177. * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
  178. * at any time. If it disagrees, the user has changed cache size.
  179. */
  180. static unsigned long dm_bufio_cache_size_latch;
  181. static DEFINE_SPINLOCK(param_spinlock);
  182. /*
  183. * Buffers are freed after this timeout
  184. */
  185. static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
  186. static unsigned long dm_bufio_peak_allocated;
  187. static unsigned long dm_bufio_allocated_kmem_cache;
  188. static unsigned long dm_bufio_allocated_get_free_pages;
  189. static unsigned long dm_bufio_allocated_vmalloc;
  190. static unsigned long dm_bufio_current_allocated;
  191. /*----------------------------------------------------------------*/
  192. /*
  193. * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
  194. */
  195. static unsigned long dm_bufio_cache_size_per_client;
  196. /*
  197. * The current number of clients.
  198. */
  199. static int dm_bufio_client_count;
  200. /*
  201. * The list of all clients.
  202. */
  203. static LIST_HEAD(dm_bufio_all_clients);
  204. /*
  205. * This mutex protects dm_bufio_cache_size_latch,
  206. * dm_bufio_cache_size_per_client and dm_bufio_client_count
  207. */
  208. static DEFINE_MUTEX(dm_bufio_clients_lock);
  209. /*----------------------------------------------------------------*/
  210. static void adjust_total_allocated(enum data_mode data_mode, long diff)
  211. {
  212. static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
  213. &dm_bufio_allocated_kmem_cache,
  214. &dm_bufio_allocated_get_free_pages,
  215. &dm_bufio_allocated_vmalloc,
  216. };
  217. spin_lock(&param_spinlock);
  218. *class_ptr[data_mode] += diff;
  219. dm_bufio_current_allocated += diff;
  220. if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
  221. dm_bufio_peak_allocated = dm_bufio_current_allocated;
  222. spin_unlock(&param_spinlock);
  223. }
  224. /*
  225. * Change the number of clients and recalculate per-client limit.
  226. */
  227. static void __cache_size_refresh(void)
  228. {
  229. BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
  230. BUG_ON(dm_bufio_client_count < 0);
  231. dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);
  232. /*
  233. * Use default if set to 0 and report the actual cache size used.
  234. */
  235. if (!dm_bufio_cache_size_latch) {
  236. (void)cmpxchg(&dm_bufio_cache_size, 0,
  237. dm_bufio_default_cache_size);
  238. dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
  239. }
  240. dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
  241. (dm_bufio_client_count ? : 1);
  242. }
  243. /*
  244. * Allocating buffer data.
  245. *
  246. * Small buffers are allocated with kmem_cache, to use space optimally.
  247. *
  248. * For large buffers, we choose between get_free_pages and vmalloc.
  249. * Each has advantages and disadvantages.
  250. *
  251. * __get_free_pages can randomly fail if the memory is fragmented.
  252. * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
  253. * as low as 128M) so using it for caching is not appropriate.
  254. *
  255. * If the allocation may fail we use __get_free_pages. Memory fragmentation
  256. * won't have a fatal effect here, but it just causes flushes of some other
  257. * buffers and more I/O will be performed. Don't use __get_free_pages if it
  258. * always fails (i.e. order >= MAX_ORDER).
  259. *
  260. * If the allocation shouldn't fail we use __vmalloc. This is only for the
  261. * initial reserve allocation, so there's no risk of wasting all vmalloc
  262. * space.
  263. */
  264. static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
  265. enum data_mode *data_mode)
  266. {
  267. unsigned noio_flag;
  268. void *ptr;
  269. if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
  270. *data_mode = DATA_MODE_SLAB;
  271. return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
  272. }
  273. if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
  274. gfp_mask & __GFP_NORETRY) {
  275. *data_mode = DATA_MODE_GET_FREE_PAGES;
  276. return (void *)__get_free_pages(gfp_mask,
  277. c->pages_per_block_bits);
  278. }
  279. *data_mode = DATA_MODE_VMALLOC;
  280. /*
  281. * __vmalloc allocates the data pages and auxiliary structures with
  282. * gfp_flags that were specified, but pagetables are always allocated
  283. * with GFP_KERNEL, no matter what was specified as gfp_mask.
  284. *
  285. * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
  286. * all allocations done by this process (including pagetables) are done
  287. * as if GFP_NOIO was specified.
  288. */
  289. if (gfp_mask & __GFP_NORETRY)
  290. noio_flag = memalloc_noio_save();
  291. ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);
  292. if (gfp_mask & __GFP_NORETRY)
  293. memalloc_noio_restore(noio_flag);
  294. return ptr;
  295. }
  296. /*
  297. * Free buffer's data.
  298. */
  299. static void free_buffer_data(struct dm_bufio_client *c,
  300. void *data, enum data_mode data_mode)
  301. {
  302. switch (data_mode) {
  303. case DATA_MODE_SLAB:
  304. kmem_cache_free(DM_BUFIO_CACHE(c), data);
  305. break;
  306. case DATA_MODE_GET_FREE_PAGES:
  307. free_pages((unsigned long)data, c->pages_per_block_bits);
  308. break;
  309. case DATA_MODE_VMALLOC:
  310. vfree(data);
  311. break;
  312. default:
  313. DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
  314. data_mode);
  315. BUG();
  316. }
  317. }
  318. /*
  319. * Allocate buffer and its data.
  320. */
  321. static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
  322. {
  323. struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
  324. gfp_mask);
  325. if (!b)
  326. return NULL;
  327. b->c = c;
  328. b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
  329. if (!b->data) {
  330. kfree(b);
  331. return NULL;
  332. }
  333. adjust_total_allocated(b->data_mode, (long)c->block_size);
  334. return b;
  335. }
  336. /*
  337. * Free buffer and its data.
  338. */
  339. static void free_buffer(struct dm_buffer *b)
  340. {
  341. struct dm_bufio_client *c = b->c;
  342. adjust_total_allocated(b->data_mode, -(long)c->block_size);
  343. free_buffer_data(c, b->data, b->data_mode);
  344. kfree(b);
  345. }
  346. /*
  347. * Link buffer to the hash list and clean or dirty queue.
  348. */
  349. static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
  350. {
  351. struct dm_bufio_client *c = b->c;
  352. c->n_buffers[dirty]++;
  353. b->block = block;
  354. b->list_mode = dirty;
  355. list_add(&b->lru_list, &c->lru[dirty]);
  356. hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
  357. b->last_accessed = jiffies;
  358. }
  359. /*
  360. * Unlink buffer from the hash list and dirty or clean queue.
  361. */
  362. static void __unlink_buffer(struct dm_buffer *b)
  363. {
  364. struct dm_bufio_client *c = b->c;
  365. BUG_ON(!c->n_buffers[b->list_mode]);
  366. c->n_buffers[b->list_mode]--;
  367. hlist_del(&b->hash_list);
  368. list_del(&b->lru_list);
  369. }
  370. /*
  371. * Place the buffer to the head of dirty or clean LRU queue.
  372. */
  373. static void __relink_lru(struct dm_buffer *b, int dirty)
  374. {
  375. struct dm_bufio_client *c = b->c;
  376. BUG_ON(!c->n_buffers[b->list_mode]);
  377. c->n_buffers[b->list_mode]--;
  378. c->n_buffers[dirty]++;
  379. b->list_mode = dirty;
  380. list_move(&b->lru_list, &c->lru[dirty]);
  381. b->last_accessed = jiffies;
  382. }
  383. /*----------------------------------------------------------------
  384. * Submit I/O on the buffer.
  385. *
  386. * Bio interface is faster but it has some problems:
  387. * the vector list is limited (increasing this limit increases
  388. * memory-consumption per buffer, so it is not viable);
  389. *
  390. * the memory must be direct-mapped, not vmalloced;
  391. *
  392. * the I/O driver can reject requests spuriously if it thinks that
  393. * the requests are too big for the device or if they cross a
  394. * controller-defined memory boundary.
  395. *
  396. * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
  397. * it is not vmalloced, try using the bio interface.
  398. *
  399. * If the buffer is big, if it is vmalloced or if the underlying device
  400. * rejects the bio because it is too large, use dm-io layer to do the I/O.
  401. * The dm-io layer splits the I/O into multiple requests, avoiding the above
  402. * shortcomings.
  403. *--------------------------------------------------------------*/
  404. /*
  405. * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
  406. * that the request was handled directly with bio interface.
  407. */
  408. static void dmio_complete(unsigned long error, void *context)
  409. {
  410. struct dm_buffer *b = context;
  411. b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
  412. }
  413. static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
  414. bio_end_io_t *end_io)
  415. {
  416. int r;
  417. struct dm_io_request io_req = {
  418. .bi_rw = rw,
  419. .notify.fn = dmio_complete,
  420. .notify.context = b,
  421. .client = b->c->dm_io,
  422. };
  423. struct dm_io_region region = {
  424. .bdev = b->c->bdev,
  425. .sector = block << b->c->sectors_per_block_bits,
  426. .count = b->c->block_size >> SECTOR_SHIFT,
  427. };
  428. if (b->data_mode != DATA_MODE_VMALLOC) {
  429. io_req.mem.type = DM_IO_KMEM;
  430. io_req.mem.ptr.addr = b->data;
  431. } else {
  432. io_req.mem.type = DM_IO_VMA;
  433. io_req.mem.ptr.vma = b->data;
  434. }
  435. b->bio.bi_end_io = end_io;
  436. r = dm_io(&io_req, 1, &region, NULL);
  437. if (r)
  438. end_io(&b->bio, r);
  439. }
  440. static void inline_endio(struct bio *bio, int error)
  441. {
  442. bio_end_io_t *end_fn = bio->bi_private;
  443. /*
  444. * Reset the bio to free any attached resources
  445. * (e.g. bio integrity profiles).
  446. */
  447. bio_reset(bio);
  448. end_fn(bio, error);
  449. }
  450. static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
  451. bio_end_io_t *end_io)
  452. {
  453. char *ptr;
  454. int len;
  455. bio_init(&b->bio);
  456. b->bio.bi_io_vec = b->bio_vec;
  457. b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
  458. b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
  459. b->bio.bi_bdev = b->c->bdev;
  460. b->bio.bi_end_io = inline_endio;
  461. /*
  462. * Use of .bi_private isn't a problem here because
  463. * the dm_buffer's inline bio is local to bufio.
  464. */
  465. b->bio.bi_private = end_io;
  466. /*
  467. * We assume that if len >= PAGE_SIZE ptr is page-aligned.
  468. * If len < PAGE_SIZE the buffer doesn't cross page boundary.
  469. */
  470. ptr = b->data;
  471. len = b->c->block_size;
  472. if (len >= PAGE_SIZE)
  473. BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
  474. else
  475. BUG_ON((unsigned long)ptr & (len - 1));
  476. do {
  477. if (!bio_add_page(&b->bio, virt_to_page(ptr),
  478. len < PAGE_SIZE ? len : PAGE_SIZE,
  479. virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
  480. BUG_ON(b->c->block_size <= PAGE_SIZE);
  481. use_dmio(b, rw, block, end_io);
  482. return;
  483. }
  484. len -= PAGE_SIZE;
  485. ptr += PAGE_SIZE;
  486. } while (len > 0);
  487. submit_bio(rw, &b->bio);
  488. }
  489. static void submit_io(struct dm_buffer *b, int rw, sector_t block,
  490. bio_end_io_t *end_io)
  491. {
  492. if (rw == WRITE && b->c->write_callback)
  493. b->c->write_callback(b);
  494. if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
  495. b->data_mode != DATA_MODE_VMALLOC)
  496. use_inline_bio(b, rw, block, end_io);
  497. else
  498. use_dmio(b, rw, block, end_io);
  499. }
  500. /*----------------------------------------------------------------
  501. * Writing dirty buffers
  502. *--------------------------------------------------------------*/
  503. /*
  504. * The endio routine for write.
  505. *
  506. * Set the error, clear B_WRITING bit and wake anyone who was waiting on
  507. * it.
  508. */
  509. static void write_endio(struct bio *bio, int error)
  510. {
  511. struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
  512. b->write_error = error;
  513. if (unlikely(error)) {
  514. struct dm_bufio_client *c = b->c;
  515. (void)cmpxchg(&c->async_write_error, 0, error);
  516. }
  517. BUG_ON(!test_bit(B_WRITING, &b->state));
  518. smp_mb__before_atomic();
  519. clear_bit(B_WRITING, &b->state);
  520. smp_mb__after_atomic();
  521. wake_up_bit(&b->state, B_WRITING);
  522. }
  523. /*
  524. * Initiate a write on a dirty buffer, but don't wait for it.
  525. *
  526. * - If the buffer is not dirty, exit.
  527. * - If there is some previous write going on, wait for it to finish (we can't
  528. * have two writes on the same buffer simultaneously).
  529. * - Submit our write and don't wait on it. We set B_WRITING indicating
  530. * that there is a write in progress.
  531. */
  532. static void __write_dirty_buffer(struct dm_buffer *b,
  533. struct list_head *write_list)
  534. {
  535. if (!test_bit(B_DIRTY, &b->state))
  536. return;
  537. clear_bit(B_DIRTY, &b->state);
  538. wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
  539. if (!write_list)
  540. submit_io(b, WRITE, b->block, write_endio);
  541. else
  542. list_add_tail(&b->write_list, write_list);
  543. }
  544. static void __flush_write_list(struct list_head *write_list)
  545. {
  546. struct blk_plug plug;
  547. blk_start_plug(&plug);
  548. while (!list_empty(write_list)) {
  549. struct dm_buffer *b =
  550. list_entry(write_list->next, struct dm_buffer, write_list);
  551. list_del(&b->write_list);
  552. submit_io(b, WRITE, b->block, write_endio);
  553. dm_bufio_cond_resched();
  554. }
  555. blk_finish_plug(&plug);
  556. }
  557. /*
  558. * Wait until any activity on the buffer finishes. Possibly write the
  559. * buffer if it is dirty. When this function finishes, there is no I/O
  560. * running on the buffer and the buffer is not dirty.
  561. */
  562. static void __make_buffer_clean(struct dm_buffer *b)
  563. {
  564. BUG_ON(b->hold_count);
  565. if (!b->state) /* fast case */
  566. return;
  567. wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
  568. __write_dirty_buffer(b, NULL);
  569. wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
  570. }
  571. /*
  572. * Find some buffer that is not held by anybody, clean it, unlink it and
  573. * return it.
  574. */
  575. static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
  576. {
  577. struct dm_buffer *b;
  578. list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
  579. BUG_ON(test_bit(B_WRITING, &b->state));
  580. BUG_ON(test_bit(B_DIRTY, &b->state));
  581. if (!b->hold_count) {
  582. __make_buffer_clean(b);
  583. __unlink_buffer(b);
  584. return b;
  585. }
  586. dm_bufio_cond_resched();
  587. }
  588. list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
  589. BUG_ON(test_bit(B_READING, &b->state));
  590. if (!b->hold_count) {
  591. __make_buffer_clean(b);
  592. __unlink_buffer(b);
  593. return b;
  594. }
  595. dm_bufio_cond_resched();
  596. }
  597. return NULL;
  598. }
  599. /*
  600. * Wait until some other thread frees some buffer or releases its hold
  601. * count on some buffer.
  602. *
  603. * This function is entered with c->lock held, drops it and regains it
  604. * before exiting.
  605. */
  606. static void __wait_for_free_buffer(struct dm_bufio_client *c)
  607. {
  608. DECLARE_WAITQUEUE(wait, current);
  609. add_wait_queue(&c->free_buffer_wait, &wait);
  610. set_task_state(current, TASK_UNINTERRUPTIBLE);
  611. dm_bufio_unlock(c);
  612. io_schedule();
  613. remove_wait_queue(&c->free_buffer_wait, &wait);
  614. dm_bufio_lock(c);
  615. }
  616. enum new_flag {
  617. NF_FRESH = 0,
  618. NF_READ = 1,
  619. NF_GET = 2,
  620. NF_PREFETCH = 3
  621. };
  622. /*
  623. * Allocate a new buffer. If the allocation is not possible, wait until
  624. * some other thread frees a buffer.
  625. *
  626. * May drop the lock and regain it.
  627. */
  628. static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
  629. {
  630. struct dm_buffer *b;
  631. /*
  632. * dm-bufio is resistant to allocation failures (it just keeps
  633. * one buffer reserved in case all the allocations fail).
  634. * So set flags to not try too hard:
  635. * GFP_NOIO: don't recurse into the I/O layer
  636. * __GFP_NORETRY: don't retry and rather return failure
  637. * __GFP_NOMEMALLOC: don't use emergency reserves
  638. * __GFP_NOWARN: don't print a warning in case of failure
  639. *
  640. * For debugging, if we set the cache size to 1, no new buffers will
  641. * be allocated.
  642. */
  643. while (1) {
  644. if (dm_bufio_cache_size_latch != 1) {
  645. b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
  646. if (b)
  647. return b;
  648. }
  649. if (nf == NF_PREFETCH)
  650. return NULL;
  651. if (!list_empty(&c->reserved_buffers)) {
  652. b = list_entry(c->reserved_buffers.next,
  653. struct dm_buffer, lru_list);
  654. list_del(&b->lru_list);
  655. c->need_reserved_buffers++;
  656. return b;
  657. }
  658. b = __get_unclaimed_buffer(c);
  659. if (b)
  660. return b;
  661. __wait_for_free_buffer(c);
  662. }
  663. }
  664. static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
  665. {
  666. struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
  667. if (!b)
  668. return NULL;
  669. if (c->alloc_callback)
  670. c->alloc_callback(b);
  671. return b;
  672. }
  673. /*
  674. * Free a buffer and wake other threads waiting for free buffers.
  675. */
  676. static void __free_buffer_wake(struct dm_buffer *b)
  677. {
  678. struct dm_bufio_client *c = b->c;
  679. if (!c->need_reserved_buffers)
  680. free_buffer(b);
  681. else {
  682. list_add(&b->lru_list, &c->reserved_buffers);
  683. c->need_reserved_buffers--;
  684. }
  685. wake_up(&c->free_buffer_wait);
  686. }
  687. static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
  688. struct list_head *write_list)
  689. {
  690. struct dm_buffer *b, *tmp;
  691. list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
  692. BUG_ON(test_bit(B_READING, &b->state));
  693. if (!test_bit(B_DIRTY, &b->state) &&
  694. !test_bit(B_WRITING, &b->state)) {
  695. __relink_lru(b, LIST_CLEAN);
  696. continue;
  697. }
  698. if (no_wait && test_bit(B_WRITING, &b->state))
  699. return;
  700. __write_dirty_buffer(b, write_list);
  701. dm_bufio_cond_resched();
  702. }
  703. }
  704. /*
  705. * Get writeback threshold and buffer limit for a given client.
  706. */
  707. static void __get_memory_limit(struct dm_bufio_client *c,
  708. unsigned long *threshold_buffers,
  709. unsigned long *limit_buffers)
  710. {
  711. unsigned long buffers;
  712. if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
  713. mutex_lock(&dm_bufio_clients_lock);
  714. __cache_size_refresh();
  715. mutex_unlock(&dm_bufio_clients_lock);
  716. }
  717. buffers = dm_bufio_cache_size_per_client >>
  718. (c->sectors_per_block_bits + SECTOR_SHIFT);
  719. if (buffers < c->minimum_buffers)
  720. buffers = c->minimum_buffers;
  721. *limit_buffers = buffers;
  722. *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
  723. }
  724. /*
  725. * Check if we're over the watermark.
  726. * If we are over limit_buffers, free unclaimed buffers until we get under the limit.
  727. * If we are over threshold_buffers, start writing back dirty buffers.
  728. */
  729. static void __check_watermark(struct dm_bufio_client *c,
  730. struct list_head *write_list)
  731. {
  732. unsigned long threshold_buffers, limit_buffers;
  733. __get_memory_limit(c, &threshold_buffers, &limit_buffers);
  734. while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
  735. limit_buffers) {
  736. struct dm_buffer *b = __get_unclaimed_buffer(c);
  737. if (!b)
  738. return;
  739. __free_buffer_wake(b);
  740. dm_bufio_cond_resched();
  741. }
  742. if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
  743. __write_dirty_buffers_async(c, 1, write_list);
  744. }
  745. /*
  746. * Find a buffer in the hash.
  747. */
  748. static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
  749. {
  750. struct dm_buffer *b;
  751. hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
  752. hash_list) {
  753. dm_bufio_cond_resched();
  754. if (b->block == block)
  755. return b;
  756. }
  757. return NULL;
  758. }
  759. /*----------------------------------------------------------------
  760. * Getting a buffer
  761. *--------------------------------------------------------------*/
  762. static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
  763. enum new_flag nf, int *need_submit,
  764. struct list_head *write_list)
  765. {
  766. struct dm_buffer *b, *new_b = NULL;
  767. *need_submit = 0;
  768. b = __find(c, block);
  769. if (b)
  770. goto found_buffer;
  771. if (nf == NF_GET)
  772. return NULL;
  773. new_b = __alloc_buffer_wait(c, nf);
  774. if (!new_b)
  775. return NULL;
  776. /*
  777. * We've had a period where the mutex was unlocked, so need to
  778. * recheck the hash table.
  779. */
  780. b = __find(c, block);
  781. if (b) {
  782. __free_buffer_wake(new_b);
  783. goto found_buffer;
  784. }
  785. __check_watermark(c, write_list);
  786. b = new_b;
  787. b->hold_count = 1;
  788. b->read_error = 0;
  789. b->write_error = 0;
  790. __link_buffer(b, block, LIST_CLEAN);
  791. if (nf == NF_FRESH) {
  792. b->state = 0;
  793. return b;
  794. }
  795. b->state = 1 << B_READING;
  796. *need_submit = 1;
  797. return b;
  798. found_buffer:
  799. if (nf == NF_PREFETCH)
  800. return NULL;
  801. /*
  802. * Note: it is essential that we don't wait for the buffer to be
  803. * read if the dm_bufio_get function is used. Both dm_bufio_get and
  804. * dm_bufio_prefetch can be used in the driver request routine.
  805. * If the user called both dm_bufio_prefetch and dm_bufio_get on
  806. * the same buffer, it would deadlock if we waited.
  807. */
  808. if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
  809. return NULL;
  810. b->hold_count++;
  811. __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
  812. test_bit(B_WRITING, &b->state));
  813. return b;
  814. }
  815. /*
  816. * The endio routine for reading: set the error, clear the bit and wake up
  817. * anyone waiting on the buffer.
  818. */
  819. static void read_endio(struct bio *bio, int error)
  820. {
  821. struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
  822. b->read_error = error;
  823. BUG_ON(!test_bit(B_READING, &b->state));
  824. smp_mb__before_atomic();
  825. clear_bit(B_READING, &b->state);
  826. smp_mb__after_atomic();
  827. wake_up_bit(&b->state, B_READING);
  828. }
  829. /*
  830. * A common routine for dm_bufio_new and dm_bufio_read. Operation of these
  831. * functions is similar except that dm_bufio_new doesn't read the
  832. * buffer from the disk (assuming that the caller overwrites all the data
  833. * and uses dm_bufio_mark_buffer_dirty to write new data back).
  834. */
  835. static void *new_read(struct dm_bufio_client *c, sector_t block,
  836. enum new_flag nf, struct dm_buffer **bp)
  837. {
  838. int need_submit;
  839. struct dm_buffer *b;
  840. LIST_HEAD(write_list);
  841. dm_bufio_lock(c);
  842. b = __bufio_new(c, block, nf, &need_submit, &write_list);
  843. dm_bufio_unlock(c);
  844. __flush_write_list(&write_list);
  845. if (!b)
  846. return b;
  847. if (need_submit)
  848. submit_io(b, READ, b->block, read_endio);
  849. wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
  850. if (b->read_error) {
  851. int error = b->read_error;
  852. dm_bufio_release(b);
  853. return ERR_PTR(error);
  854. }
  855. *bp = b;
  856. return b->data;
  857. }
  858. void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
  859. struct dm_buffer **bp)
  860. {
  861. return new_read(c, block, NF_GET, bp);
  862. }
  863. EXPORT_SYMBOL_GPL(dm_bufio_get);
  864. void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
  865. struct dm_buffer **bp)
  866. {
  867. BUG_ON(dm_bufio_in_request());
  868. return new_read(c, block, NF_READ, bp);
  869. }
  870. EXPORT_SYMBOL_GPL(dm_bufio_read);
  871. void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
  872. struct dm_buffer **bp)
  873. {
  874. BUG_ON(dm_bufio_in_request());
  875. return new_read(c, block, NF_FRESH, bp);
  876. }
  877. EXPORT_SYMBOL_GPL(dm_bufio_new);
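/*
 * Illustrative usage sketch (not taken from any in-tree caller; "c", "block"
 * and the process() helper are hypothetical): a typical read through the
 * interface above. dm_bufio_read() returns the data pointer and stores the
 * buffer handle; on a read error it returns an ERR_PTR, so the caller checks
 * with IS_ERR() and releases the buffer when done.
 *
 *	struct dm_buffer *buf;
 *	void *data = dm_bufio_read(c, block, &buf);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	process(data, dm_bufio_get_block_size(c));
 *	dm_bufio_release(buf);
 */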
  878. void dm_bufio_prefetch(struct dm_bufio_client *c,
  879. sector_t block, unsigned n_blocks)
  880. {
  881. struct blk_plug plug;
  882. LIST_HEAD(write_list);
  883. BUG_ON(dm_bufio_in_request());
  884. blk_start_plug(&plug);
  885. dm_bufio_lock(c);
  886. for (; n_blocks--; block++) {
  887. int need_submit;
  888. struct dm_buffer *b;
  889. b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
  890. &write_list);
  891. if (unlikely(!list_empty(&write_list))) {
  892. dm_bufio_unlock(c);
  893. blk_finish_plug(&plug);
  894. __flush_write_list(&write_list);
  895. blk_start_plug(&plug);
  896. dm_bufio_lock(c);
  897. }
  898. if (unlikely(b != NULL)) {
  899. dm_bufio_unlock(c);
  900. if (need_submit)
  901. submit_io(b, READ, b->block, read_endio);
  902. dm_bufio_release(b);
  903. dm_bufio_cond_resched();
  904. if (!n_blocks)
  905. goto flush_plug;
  906. dm_bufio_lock(c);
  907. }
  908. }
  909. dm_bufio_unlock(c);
  910. flush_plug:
  911. blk_finish_plug(&plug);
  912. }
  913. EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
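/*
 * Illustrative prefetch sketch (hypothetical caller; "c", "first", "count",
 * "buf" and "i" are assumptions): dm_bufio_prefetch() only starts the reads,
 * so it is typically followed by dm_bufio_read() calls that then mostly hit
 * already-cached buffers.
 *
 *	dm_bufio_prefetch(c, first, count);
 *	for (i = 0; i < count; i++) {
 *		void *data = dm_bufio_read(c, first + i, &buf);
 *		if (IS_ERR(data))
 *			continue;
 *		dm_bufio_release(buf);
 *	}
 */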
  914. void dm_bufio_release(struct dm_buffer *b)
  915. {
  916. struct dm_bufio_client *c = b->c;
  917. dm_bufio_lock(c);
  918. BUG_ON(!b->hold_count);
  919. b->hold_count--;
  920. if (!b->hold_count) {
  921. wake_up(&c->free_buffer_wait);
  922. /*
  923. * If there were errors on the buffer, and the buffer is not
  924. * to be written, free the buffer. There is no point in caching
  925. * an invalid buffer.
  926. */
  927. if ((b->read_error || b->write_error) &&
  928. !test_bit(B_READING, &b->state) &&
  929. !test_bit(B_WRITING, &b->state) &&
  930. !test_bit(B_DIRTY, &b->state)) {
  931. __unlink_buffer(b);
  932. __free_buffer_wake(b);
  933. }
  934. }
  935. dm_bufio_unlock(c);
  936. }
  937. EXPORT_SYMBOL_GPL(dm_bufio_release);
  938. void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
  939. {
  940. struct dm_bufio_client *c = b->c;
  941. dm_bufio_lock(c);
  942. BUG_ON(test_bit(B_READING, &b->state));
  943. if (!test_and_set_bit(B_DIRTY, &b->state))
  944. __relink_lru(b, LIST_DIRTY);
  945. dm_bufio_unlock(c);
  946. }
  947. EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
  948. void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
  949. {
  950. LIST_HEAD(write_list);
  951. BUG_ON(dm_bufio_in_request());
  952. dm_bufio_lock(c);
  953. __write_dirty_buffers_async(c, 0, &write_list);
  954. dm_bufio_unlock(c);
  955. __flush_write_list(&write_list);
  956. }
  957. EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
  958. /*
  959. * For performance, it is essential that the buffers are written asynchronously
  960. * and simultaneously (so that the block layer can merge the writes) and then
  961. * waited upon.
  962. *
  963. * Finally, we flush the hardware disk cache.
  964. */
  965. int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
  966. {
  967. int a, f;
  968. unsigned long buffers_processed = 0;
  969. struct dm_buffer *b, *tmp;
  970. LIST_HEAD(write_list);
  971. dm_bufio_lock(c);
  972. __write_dirty_buffers_async(c, 0, &write_list);
  973. dm_bufio_unlock(c);
  974. __flush_write_list(&write_list);
  975. dm_bufio_lock(c);
  976. again:
  977. list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
  978. int dropped_lock = 0;
  979. if (buffers_processed < c->n_buffers[LIST_DIRTY])
  980. buffers_processed++;
  981. BUG_ON(test_bit(B_READING, &b->state));
  982. if (test_bit(B_WRITING, &b->state)) {
  983. if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
  984. dropped_lock = 1;
  985. b->hold_count++;
  986. dm_bufio_unlock(c);
  987. wait_on_bit_io(&b->state, B_WRITING,
  988. TASK_UNINTERRUPTIBLE);
  989. dm_bufio_lock(c);
  990. b->hold_count--;
  991. } else
  992. wait_on_bit_io(&b->state, B_WRITING,
  993. TASK_UNINTERRUPTIBLE);
  994. }
  995. if (!test_bit(B_DIRTY, &b->state) &&
  996. !test_bit(B_WRITING, &b->state))
  997. __relink_lru(b, LIST_CLEAN);
  998. dm_bufio_cond_resched();
  999. /*
  1000. * If we dropped the lock, the list is no longer consistent,
  1001. * so we must restart the search.
  1002. *
  1003. * In the most common case, the buffer just processed is
  1004. * relinked to the clean list, so we won't loop scanning the
  1005. * same buffer again and again.
  1006. *
  1007. * This may livelock if there is another thread simultaneously
  1008. * dirtying buffers, so we count the number of buffers walked
  1009. * and if it exceeds the total number of buffers, it means that
  1010. * someone is doing some writes simultaneously with us. In
  1011. * this case, stop, dropping the lock.
  1012. */
  1013. if (dropped_lock)
  1014. goto again;
  1015. }
  1016. wake_up(&c->free_buffer_wait);
  1017. dm_bufio_unlock(c);
  1018. a = xchg(&c->async_write_error, 0);
  1019. f = dm_bufio_issue_flush(c);
  1020. if (a)
  1021. return a;
  1022. return f;
  1023. }
  1024. EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
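/*
 * Illustrative write-back sketch (hypothetical caller; "c", "block", "buf"
 * and "r" are assumptions): modify a cached block, mark it dirty, release it,
 * and then call dm_bufio_write_dirty_buffers(), which writes all dirty
 * buffers, waits for them and flushes the disk cache before returning.
 *
 *	void *data = dm_bufio_read(c, block, &buf);
 *
 *	if (!IS_ERR(data)) {
 *		memset(data, 0, dm_bufio_get_block_size(c));
 *		dm_bufio_mark_buffer_dirty(buf);
 *		dm_bufio_release(buf);
 *	}
 *	r = dm_bufio_write_dirty_buffers(c);
 */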
  1025. /*
  1026. * Use dm-io to send an empty barrier and flush the device.
  1027. */
  1028. int dm_bufio_issue_flush(struct dm_bufio_client *c)
  1029. {
  1030. struct dm_io_request io_req = {
  1031. .bi_rw = WRITE_FLUSH,
  1032. .mem.type = DM_IO_KMEM,
  1033. .mem.ptr.addr = NULL,
  1034. .client = c->dm_io,
  1035. };
  1036. struct dm_io_region io_reg = {
  1037. .bdev = c->bdev,
  1038. .sector = 0,
  1039. .count = 0,
  1040. };
  1041. BUG_ON(dm_bufio_in_request());
  1042. return dm_io(&io_req, 1, &io_reg, NULL);
  1043. }
  1044. EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
  1045. /*
  1046. * We first delete any other buffer that may be at that new location.
  1047. *
  1048. * Then, we write the buffer to the original location if it was dirty.
  1049. *
  1050. * Then, if we are the only one who is holding the buffer, relink the buffer
  1051. * in the hash queue for the new location.
  1052. *
  1053. * If there was someone else holding the buffer, we write it to the new
  1054. * location but not relink it, because that other user needs to have the buffer
  1055. * at the same place.
  1056. */
  1057. void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
  1058. {
  1059. struct dm_bufio_client *c = b->c;
  1060. struct dm_buffer *new;
  1061. BUG_ON(dm_bufio_in_request());
  1062. dm_bufio_lock(c);
  1063. retry:
  1064. new = __find(c, new_block);
  1065. if (new) {
  1066. if (new->hold_count) {
  1067. __wait_for_free_buffer(c);
  1068. goto retry;
  1069. }
  1070. /*
  1071. * FIXME: Is there any point waiting for a write that's going
  1072. * to be overwritten in a bit?
  1073. */
  1074. __make_buffer_clean(new);
  1075. __unlink_buffer(new);
  1076. __free_buffer_wake(new);
  1077. }
  1078. BUG_ON(!b->hold_count);
  1079. BUG_ON(test_bit(B_READING, &b->state));
  1080. __write_dirty_buffer(b, NULL);
  1081. if (b->hold_count == 1) {
  1082. wait_on_bit_io(&b->state, B_WRITING,
  1083. TASK_UNINTERRUPTIBLE);
  1084. set_bit(B_DIRTY, &b->state);
  1085. __unlink_buffer(b);
  1086. __link_buffer(b, new_block, LIST_DIRTY);
  1087. } else {
  1088. sector_t old_block;
  1089. wait_on_bit_lock_io(&b->state, B_WRITING,
  1090. TASK_UNINTERRUPTIBLE);
  1091. /*
  1092. * Relink buffer to "new_block" so that write_callback
  1093. * sees "new_block" as a block number.
  1094. * After the write, link the buffer back to old_block.
  1095. * All this must be done in bufio lock, so that block number
  1096. * change isn't visible to other threads.
  1097. */
  1098. old_block = b->block;
  1099. __unlink_buffer(b);
  1100. __link_buffer(b, new_block, b->list_mode);
  1101. submit_io(b, WRITE, new_block, write_endio);
  1102. wait_on_bit_io(&b->state, B_WRITING,
  1103. TASK_UNINTERRUPTIBLE);
  1104. __unlink_buffer(b);
  1105. __link_buffer(b, old_block, b->list_mode);
  1106. }
  1107. dm_bufio_unlock(c);
  1108. dm_bufio_release(b);
  1109. }
  1110. EXPORT_SYMBOL_GPL(dm_bufio_release_move);
  1111. /*
  1112. * Free the given buffer.
  1113. *
  1114. * This is just a hint; if the buffer is in use or dirty, this function
  1115. * does nothing.
  1116. */
  1117. void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
  1118. {
  1119. struct dm_buffer *b;
  1120. dm_bufio_lock(c);
  1121. b = __find(c, block);
  1122. if (b && likely(!b->hold_count) && likely(!b->state)) {
  1123. __unlink_buffer(b);
  1124. __free_buffer_wake(b);
  1125. }
  1126. dm_bufio_unlock(c);
  1127. }
  1128. EXPORT_SYMBOL(dm_bufio_forget);
  1129. void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
  1130. {
  1131. c->minimum_buffers = n;
  1132. }
  1133. EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);
  1134. unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
  1135. {
  1136. return c->block_size;
  1137. }
  1138. EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
  1139. sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
  1140. {
  1141. return i_size_read(c->bdev->bd_inode) >>
  1142. (SECTOR_SHIFT + c->sectors_per_block_bits);
  1143. }
  1144. EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
  1145. sector_t dm_bufio_get_block_number(struct dm_buffer *b)
  1146. {
  1147. return b->block;
  1148. }
  1149. EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
  1150. void *dm_bufio_get_block_data(struct dm_buffer *b)
  1151. {
  1152. return b->data;
  1153. }
  1154. EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
  1155. void *dm_bufio_get_aux_data(struct dm_buffer *b)
  1156. {
  1157. return b + 1;
  1158. }
  1159. EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
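/*
 * Illustrative aux-data sketch (assumed code, not from this file; struct
 * my_aux and compute_csum() are hypothetical): a client created with
 * aux_size == sizeof(struct my_aux) gets that many extra bytes placed right
 * after each struct dm_buffer, and dm_bufio_get_aux_data() returns a pointer
 * to them, e.g. for per-buffer bookkeeping in the write callback.
 *
 *	struct my_aux { u32 csum; };
 *
 *	static void my_write_callback(struct dm_buffer *b)
 *	{
 *		struct my_aux *aux = dm_bufio_get_aux_data(b);
 *
 *		aux->csum = compute_csum(dm_bufio_get_block_data(b));
 *	}
 */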
  1160. struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
  1161. {
  1162. return b->c;
  1163. }
  1164. EXPORT_SYMBOL_GPL(dm_bufio_get_client);
  1165. static void drop_buffers(struct dm_bufio_client *c)
  1166. {
  1167. struct dm_buffer *b;
  1168. int i;
  1169. BUG_ON(dm_bufio_in_request());
  1170. /*
  1171. * An optimization so that the buffers are not written one-by-one.
  1172. */
  1173. dm_bufio_write_dirty_buffers_async(c);
  1174. dm_bufio_lock(c);
  1175. while ((b = __get_unclaimed_buffer(c)))
  1176. __free_buffer_wake(b);
  1177. for (i = 0; i < LIST_SIZE; i++)
  1178. list_for_each_entry(b, &c->lru[i], lru_list)
  1179. DMERR("leaked buffer %llx, hold count %u, list %d",
  1180. (unsigned long long)b->block, b->hold_count, i);
  1181. for (i = 0; i < LIST_SIZE; i++)
  1182. BUG_ON(!list_empty(&c->lru[i]));
  1183. dm_bufio_unlock(c);
  1184. }
  1185. /*
  1186. * Test if the buffer is unused and too old, and free it if so.
  1187. * If GFP_NOFS is used, we must not do any I/O because we hold
  1188. * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
  1189. * rerouted to a different bufio client.
  1190. */
  1191. static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
  1192. unsigned long max_jiffies)
  1193. {
  1194. if (jiffies - b->last_accessed < max_jiffies)
  1195. return 0;
  1196. if (!(gfp & __GFP_FS)) {
  1197. if (test_bit(B_READING, &b->state) ||
  1198. test_bit(B_WRITING, &b->state) ||
  1199. test_bit(B_DIRTY, &b->state))
  1200. return 0;
  1201. }
  1202. if (b->hold_count)
  1203. return 0;
  1204. __make_buffer_clean(b);
  1205. __unlink_buffer(b);
  1206. __free_buffer_wake(b);
  1207. return 1;
  1208. }
  1209. static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
  1210. gfp_t gfp_mask)
  1211. {
  1212. int l;
  1213. struct dm_buffer *b, *tmp;
  1214. long freed = 0;
  1215. for (l = 0; l < LIST_SIZE; l++) {
  1216. list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
  1217. freed += __cleanup_old_buffer(b, gfp_mask, 0);
  1218. if (!--nr_to_scan)
  1219. return freed;
  1220. dm_bufio_cond_resched();
  1221. }
  1222. }
  1223. return freed;
  1224. }
  1225. static unsigned long
  1226. dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
  1227. {
  1228. struct dm_bufio_client *c;
  1229. unsigned long freed;
  1230. c = container_of(shrink, struct dm_bufio_client, shrinker);
  1231. if (sc->gfp_mask & __GFP_FS)
  1232. dm_bufio_lock(c);
  1233. else if (!dm_bufio_trylock(c))
  1234. return SHRINK_STOP;
  1235. freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
  1236. dm_bufio_unlock(c);
  1237. return freed;
  1238. }
  1239. static unsigned long
  1240. dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
  1241. {
  1242. struct dm_bufio_client *c;
  1243. unsigned long count;
  1244. c = container_of(shrink, struct dm_bufio_client, shrinker);
  1245. if (sc->gfp_mask & __GFP_FS)
  1246. dm_bufio_lock(c);
  1247. else if (!dm_bufio_trylock(c))
  1248. return 0;
  1249. count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
  1250. dm_bufio_unlock(c);
  1251. return count;
  1252. }
  1253. /*
  1254. * Create the buffering interface
  1255. */
  1256. struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
  1257. unsigned reserved_buffers, unsigned aux_size,
  1258. void (*alloc_callback)(struct dm_buffer *),
  1259. void (*write_callback)(struct dm_buffer *))
  1260. {
  1261. int r;
  1262. struct dm_bufio_client *c;
  1263. unsigned i;
  1264. BUG_ON(block_size < 1 << SECTOR_SHIFT ||
  1265. (block_size & (block_size - 1)));
  1266. c = kzalloc(sizeof(*c), GFP_KERNEL);
  1267. if (!c) {
  1268. r = -ENOMEM;
  1269. goto bad_client;
  1270. }
  1271. c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
  1272. if (!c->cache_hash) {
  1273. r = -ENOMEM;
  1274. goto bad_hash;
  1275. }
  1276. c->bdev = bdev;
  1277. c->block_size = block_size;
  1278. c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
  1279. c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
  1280. ffs(block_size) - 1 - PAGE_SHIFT : 0;
  1281. c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
  1282. PAGE_SHIFT - (ffs(block_size) - 1) : 0);
  1283. c->aux_size = aux_size;
  1284. c->alloc_callback = alloc_callback;
  1285. c->write_callback = write_callback;
  1286. for (i = 0; i < LIST_SIZE; i++) {
  1287. INIT_LIST_HEAD(&c->lru[i]);
  1288. c->n_buffers[i] = 0;
  1289. }
  1290. for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
  1291. INIT_HLIST_HEAD(&c->cache_hash[i]);
  1292. mutex_init(&c->lock);
  1293. INIT_LIST_HEAD(&c->reserved_buffers);
  1294. c->need_reserved_buffers = reserved_buffers;
  1295. c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;
  1296. init_waitqueue_head(&c->free_buffer_wait);
  1297. c->async_write_error = 0;
  1298. c->dm_io = dm_io_client_create();
  1299. if (IS_ERR(c->dm_io)) {
  1300. r = PTR_ERR(c->dm_io);
  1301. goto bad_dm_io;
  1302. }
  1303. mutex_lock(&dm_bufio_clients_lock);
  1304. if (c->blocks_per_page_bits) {
  1305. if (!DM_BUFIO_CACHE_NAME(c)) {
  1306. DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
  1307. if (!DM_BUFIO_CACHE_NAME(c)) {
  1308. r = -ENOMEM;
  1309. mutex_unlock(&dm_bufio_clients_lock);
  1310. goto bad_cache;
  1311. }
  1312. }
  1313. if (!DM_BUFIO_CACHE(c)) {
  1314. DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
  1315. c->block_size,
  1316. c->block_size, 0, NULL);
  1317. if (!DM_BUFIO_CACHE(c)) {
  1318. r = -ENOMEM;
  1319. mutex_unlock(&dm_bufio_clients_lock);
  1320. goto bad_cache;
  1321. }
  1322. }
  1323. }
  1324. mutex_unlock(&dm_bufio_clients_lock);
  1325. while (c->need_reserved_buffers) {
  1326. struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
  1327. if (!b) {
  1328. r = -ENOMEM;
  1329. goto bad_buffer;
  1330. }
  1331. __free_buffer_wake(b);
  1332. }
  1333. mutex_lock(&dm_bufio_clients_lock);
  1334. dm_bufio_client_count++;
  1335. list_add(&c->client_list, &dm_bufio_all_clients);
  1336. __cache_size_refresh();
  1337. mutex_unlock(&dm_bufio_clients_lock);
  1338. c->shrinker.count_objects = dm_bufio_shrink_count;
  1339. c->shrinker.scan_objects = dm_bufio_shrink_scan;
  1340. c->shrinker.seeks = 1;
  1341. c->shrinker.batch = 0;
  1342. register_shrinker(&c->shrinker);
  1343. return c;
  1344. bad_buffer:
  1345. bad_cache:
  1346. while (!list_empty(&c->reserved_buffers)) {
  1347. struct dm_buffer *b = list_entry(c->reserved_buffers.next,
  1348. struct dm_buffer, lru_list);
  1349. list_del(&b->lru_list);
  1350. free_buffer(b);
  1351. }
  1352. dm_io_client_destroy(c->dm_io);
  1353. bad_dm_io:
  1354. vfree(c->cache_hash);
  1355. bad_hash:
  1356. kfree(c);
  1357. bad_client:
  1358. return ERR_PTR(r);
  1359. }
  1360. EXPORT_SYMBOL_GPL(dm_bufio_client_create);
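/*
 * Illustrative client setup sketch (hypothetical; "md->bdev" and the
 * 4096/1/0 arguments are assumptions, not requirements beyond the checks in
 * dm_bufio_client_create() itself): one client per metadata device with a
 * power-of-two block size, destroyed only after all buffers are released.
 *
 *	struct dm_bufio_client *c;
 *
 *	c = dm_bufio_client_create(md->bdev, 4096, 1, 0, NULL, NULL);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *	...
 *	dm_bufio_write_dirty_buffers(c);
 *	dm_bufio_client_destroy(c);
 */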
  1361. /*
  1362. * Free the buffering interface.
  1363. * It is required that there are no references on any buffers.
  1364. */
  1365. void dm_bufio_client_destroy(struct dm_bufio_client *c)
  1366. {
  1367. unsigned i;
  1368. drop_buffers(c);
  1369. unregister_shrinker(&c->shrinker);
  1370. mutex_lock(&dm_bufio_clients_lock);
  1371. list_del(&c->client_list);
  1372. dm_bufio_client_count--;
  1373. __cache_size_refresh();
  1374. mutex_unlock(&dm_bufio_clients_lock);
  1375. for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
  1376. BUG_ON(!hlist_empty(&c->cache_hash[i]));
  1377. BUG_ON(c->need_reserved_buffers);
  1378. while (!list_empty(&c->reserved_buffers)) {
  1379. struct dm_buffer *b = list_entry(c->reserved_buffers.next,
  1380. struct dm_buffer, lru_list);
  1381. list_del(&b->lru_list);
  1382. free_buffer(b);
  1383. }
  1384. for (i = 0; i < LIST_SIZE; i++)
  1385. if (c->n_buffers[i])
  1386. DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);
  1387. for (i = 0; i < LIST_SIZE; i++)
  1388. BUG_ON(c->n_buffers[i]);
  1389. dm_io_client_destroy(c->dm_io);
  1390. vfree(c->cache_hash);
  1391. kfree(c);
  1392. }
  1393. EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
  1394. static void cleanup_old_buffers(void)
  1395. {
  1396. unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
  1397. struct dm_bufio_client *c;
  1398. if (max_age > ULONG_MAX / HZ)
  1399. max_age = ULONG_MAX / HZ;
  1400. mutex_lock(&dm_bufio_clients_lock);
  1401. list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
  1402. if (!dm_bufio_trylock(c))
  1403. continue;
  1404. while (!list_empty(&c->lru[LIST_CLEAN])) {
  1405. struct dm_buffer *b;
  1406. b = list_entry(c->lru[LIST_CLEAN].prev,
  1407. struct dm_buffer, lru_list);
  1408. if (!__cleanup_old_buffer(b, 0, max_age * HZ))
  1409. break;
  1410. dm_bufio_cond_resched();
  1411. }
  1412. dm_bufio_unlock(c);
  1413. dm_bufio_cond_resched();
  1414. }
  1415. mutex_unlock(&dm_bufio_clients_lock);
  1416. }
  1417. static struct workqueue_struct *dm_bufio_wq;
  1418. static struct delayed_work dm_bufio_work;
  1419. static void work_fn(struct work_struct *w)
  1420. {
  1421. cleanup_old_buffers();
  1422. queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
  1423. DM_BUFIO_WORK_TIMER_SECS * HZ);
  1424. }
  1425. /*----------------------------------------------------------------
  1426. * Module setup
  1427. *--------------------------------------------------------------*/
  1428. /*
  1429. * This is called only once for the whole dm_bufio module.
  1430. * It initializes memory limit.
  1431. */
  1432. static int __init dm_bufio_init(void)
  1433. {
  1434. __u64 mem;
  1435. dm_bufio_allocated_kmem_cache = 0;
  1436. dm_bufio_allocated_get_free_pages = 0;
  1437. dm_bufio_allocated_vmalloc = 0;
  1438. dm_bufio_current_allocated = 0;
  1439. memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
  1440. memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
  1441. mem = (__u64)((totalram_pages - totalhigh_pages) *
  1442. DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;
  1443. if (mem > ULONG_MAX)
  1444. mem = ULONG_MAX;
  1445. #ifdef CONFIG_MMU
  1446. /*
  1447. * Get the size of vmalloc space the same way as VMALLOC_TOTAL
  1448. * in fs/proc/internal.h
  1449. */
  1450. if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
  1451. mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
  1452. #endif
  1453. dm_bufio_default_cache_size = mem;
  1454. mutex_lock(&dm_bufio_clients_lock);
  1455. __cache_size_refresh();
  1456. mutex_unlock(&dm_bufio_clients_lock);
  1457. dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
  1458. if (!dm_bufio_wq)
  1459. return -ENOMEM;
  1460. INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
  1461. queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
  1462. DM_BUFIO_WORK_TIMER_SECS * HZ);
  1463. return 0;
  1464. }
  1465. /*
  1466. * This is called once when unloading the dm_bufio module.
  1467. */
  1468. static void __exit dm_bufio_exit(void)
  1469. {
  1470. int bug = 0;
  1471. int i;
  1472. cancel_delayed_work_sync(&dm_bufio_work);
  1473. destroy_workqueue(dm_bufio_wq);
  1474. for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
  1475. struct kmem_cache *kc = dm_bufio_caches[i];
  1476. if (kc)
  1477. kmem_cache_destroy(kc);
  1478. }
  1479. for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
  1480. kfree(dm_bufio_cache_names[i]);
  1481. if (dm_bufio_client_count) {
  1482. DMCRIT("%s: dm_bufio_client_count leaked: %d",
  1483. __func__, dm_bufio_client_count);
  1484. bug = 1;
  1485. }
  1486. if (dm_bufio_current_allocated) {
  1487. DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
  1488. __func__, dm_bufio_current_allocated);
  1489. bug = 1;
  1490. }
  1491. if (dm_bufio_allocated_get_free_pages) {
  1492. DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
  1493. __func__, dm_bufio_allocated_get_free_pages);
  1494. bug = 1;
  1495. }
  1496. if (dm_bufio_allocated_vmalloc) {
  1497. DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
  1498. __func__, dm_bufio_allocated_vmalloc);
  1499. bug = 1;
  1500. }
  1501. if (bug)
  1502. BUG();
  1503. }
  1504. module_init(dm_bufio_init)
  1505. module_exit(dm_bufio_exit)
  1506. module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
  1507. MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
  1508. module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
  1509. MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
  1510. module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
  1511. MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
  1512. module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
  1513. MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
  1514. module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
  1515. MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
  1516. module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
  1517. MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
  1518. module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
  1519. MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
  1520. MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
  1521. MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
  1522. MODULE_LICENSE("GPL");