blk-mq.c

  1. /*
  2. * Block multiqueue core code
  3. *
  4. * Copyright (C) 2013-2014 Jens Axboe
  5. * Copyright (C) 2013-2014 Christoph Hellwig
  6. */
  7. #include <linux/kernel.h>
  8. #include <linux/module.h>
  9. #include <linux/backing-dev.h>
  10. #include <linux/bio.h>
  11. #include <linux/blkdev.h>
  12. #include <linux/mm.h>
  13. #include <linux/init.h>
  14. #include <linux/slab.h>
  15. #include <linux/workqueue.h>
  16. #include <linux/smp.h>
  17. #include <linux/llist.h>
  18. #include <linux/list_sort.h>
  19. #include <linux/cpu.h>
  20. #include <linux/cache.h>
  21. #include <linux/sched/sysctl.h>
  22. #include <linux/delay.h>
  23. #include <linux/crash_dump.h>
  24. #include <trace/events/block.h>
  25. #include <linux/blk-mq.h>
  26. #include "blk.h"
  27. #include "blk-mq.h"
  28. #include "blk-mq-tag.h"
  29. static DEFINE_MUTEX(all_q_mutex);
  30. static LIST_HEAD(all_q_list);
  31. static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
  32. /*
  33. * Check if any of the ctx's have pending work in this hardware queue
  34. */
  35. static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
  36. {
  37. unsigned int i;
  38. for (i = 0; i < hctx->ctx_map.map_size; i++)
  39. if (hctx->ctx_map.map[i].word)
  40. return true;
  41. return false;
  42. }
  43. static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
  44. struct blk_mq_ctx *ctx)
  45. {
  46. return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
  47. }
  48. #define CTX_TO_BIT(hctx, ctx) \
  49. ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
  50. /*
  51. * Mark this ctx as having pending work in this hardware queue
  52. */
  53. static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
  54. struct blk_mq_ctx *ctx)
  55. {
  56. struct blk_align_bitmap *bm = get_bm(hctx, ctx);
  57. if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
  58. set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
  59. }
  60. static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
  61. struct blk_mq_ctx *ctx)
  62. {
  63. struct blk_align_bitmap *bm = get_bm(hctx, ctx);
  64. clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
  65. }
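/*
 * Worked example (assuming the default bits_per_word of 8 set up in
 * blk_mq_alloc_bitmap() below): a software queue with index_hw == 11 is
 * tracked in hctx->ctx_map.map[11 / 8] == map[1], at bit (11 & 7) == 3.
 * Marking and clearing pending work therefore only touches the one
 * cache-aligned word shared by at most bits_per_word software queues.
 */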
  66. static int blk_mq_queue_enter(struct request_queue *q)
  67. {
  68. while (true) {
  69. int ret;
  70. if (percpu_ref_tryget_live(&q->mq_usage_counter))
  71. return 0;
  72. ret = wait_event_interruptible(q->mq_freeze_wq,
  73. !q->mq_freeze_depth || blk_queue_dying(q));
  74. if (blk_queue_dying(q))
  75. return -ENODEV;
  76. if (ret)
  77. return ret;
  78. }
  79. }
  80. static void blk_mq_queue_exit(struct request_queue *q)
  81. {
  82. percpu_ref_put(&q->mq_usage_counter);
  83. }
  84. static void blk_mq_usage_counter_release(struct percpu_ref *ref)
  85. {
  86. struct request_queue *q =
  87. container_of(ref, struct request_queue, mq_usage_counter);
  88. wake_up_all(&q->mq_freeze_wq);
  89. }
  90. static void blk_mq_freeze_queue_start(struct request_queue *q)
  91. {
  92. bool freeze;
  93. spin_lock_irq(q->queue_lock);
  94. freeze = !q->mq_freeze_depth++;
  95. spin_unlock_irq(q->queue_lock);
  96. if (freeze) {
  97. percpu_ref_kill(&q->mq_usage_counter);
  98. blk_mq_run_queues(q, false);
  99. }
  100. }
  101. static void blk_mq_freeze_queue_wait(struct request_queue *q)
  102. {
  103. wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
  104. }
  105. /*
  106. * Guarantee no request is in use, so we can change any data structure of
  107. * the queue afterward.
  108. */
  109. void blk_mq_freeze_queue(struct request_queue *q)
  110. {
  111. blk_mq_freeze_queue_start(q);
  112. blk_mq_freeze_queue_wait(q);
  113. }
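/*
 * A minimal sketch of the intended freeze/unfreeze pattern, as used by
 * blk_mq_update_tag_set_depth() below: freeze the queue, change state that
 * must not race with request allocation, then unfreeze.
 *
 *	blk_mq_freeze_queue(q);
 *	hctx->flags |= BLK_MQ_F_TAG_SHARED;	(example state change)
 *	blk_mq_unfreeze_queue(q);
 */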
  114. static void blk_mq_unfreeze_queue(struct request_queue *q)
  115. {
  116. bool wake;
  117. spin_lock_irq(q->queue_lock);
  118. wake = !--q->mq_freeze_depth;
  119. WARN_ON_ONCE(q->mq_freeze_depth < 0);
  120. spin_unlock_irq(q->queue_lock);
  121. if (wake) {
  122. percpu_ref_reinit(&q->mq_usage_counter);
  123. wake_up_all(&q->mq_freeze_wq);
  124. }
  125. }
  126. bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
  127. {
  128. return blk_mq_has_free_tags(hctx->tags);
  129. }
  130. EXPORT_SYMBOL(blk_mq_can_queue);
  131. static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
  132. struct request *rq, unsigned int rw_flags)
  133. {
  134. if (blk_queue_io_stat(q))
  135. rw_flags |= REQ_IO_STAT;
  136. INIT_LIST_HEAD(&rq->queuelist);
  137. /* csd/requeue_work/fifo_time is initialized before use */
  138. rq->q = q;
  139. rq->mq_ctx = ctx;
  140. rq->cmd_flags |= rw_flags;
  141. /* do not touch atomic flags, it needs atomic ops against the timer */
  142. rq->cpu = -1;
  143. INIT_HLIST_NODE(&rq->hash);
  144. RB_CLEAR_NODE(&rq->rb_node);
  145. rq->rq_disk = NULL;
  146. rq->part = NULL;
  147. rq->start_time = jiffies;
  148. #ifdef CONFIG_BLK_CGROUP
  149. rq->rl = NULL;
  150. set_start_time_ns(rq);
  151. rq->io_start_time_ns = 0;
  152. #endif
  153. rq->nr_phys_segments = 0;
  154. #if defined(CONFIG_BLK_DEV_INTEGRITY)
  155. rq->nr_integrity_segments = 0;
  156. #endif
  157. rq->special = NULL;
  158. /* tag was already set */
  159. rq->errors = 0;
  160. rq->cmd = rq->__cmd;
  161. rq->extra_len = 0;
  162. rq->sense_len = 0;
  163. rq->resid_len = 0;
  164. rq->sense = NULL;
  165. INIT_LIST_HEAD(&rq->timeout_list);
  166. rq->timeout = 0;
  167. rq->end_io = NULL;
  168. rq->end_io_data = NULL;
  169. rq->next_rq = NULL;
  170. ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
  171. }
  172. static struct request *
  173. __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
  174. {
  175. struct request *rq;
  176. unsigned int tag;
  177. tag = blk_mq_get_tag(data);
  178. if (tag != BLK_MQ_TAG_FAIL) {
  179. rq = data->hctx->tags->rqs[tag];
  180. if (blk_mq_tag_busy(data->hctx)) {
  181. rq->cmd_flags = REQ_MQ_INFLIGHT;
  182. atomic_inc(&data->hctx->nr_active);
  183. }
  184. rq->tag = tag;
  185. blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
  186. return rq;
  187. }
  188. return NULL;
  189. }
  190. struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
  191. bool reserved)
  192. {
  193. struct blk_mq_ctx *ctx;
  194. struct blk_mq_hw_ctx *hctx;
  195. struct request *rq;
  196. struct blk_mq_alloc_data alloc_data;
  197. int ret;
  198. ret = blk_mq_queue_enter(q);
  199. if (ret)
  200. return ERR_PTR(ret);
  201. ctx = blk_mq_get_ctx(q);
  202. hctx = q->mq_ops->map_queue(q, ctx->cpu);
  203. blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
  204. reserved, ctx, hctx);
  205. rq = __blk_mq_alloc_request(&alloc_data, rw);
  206. if (!rq && (gfp & __GFP_WAIT)) {
  207. __blk_mq_run_hw_queue(hctx);
  208. blk_mq_put_ctx(ctx);
  209. ctx = blk_mq_get_ctx(q);
  210. hctx = q->mq_ops->map_queue(q, ctx->cpu);
  211. blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
  212. hctx);
  213. rq = __blk_mq_alloc_request(&alloc_data, rw);
  214. ctx = alloc_data.ctx;
  215. }
  216. blk_mq_put_ctx(ctx);
  217. if (!rq)
  218. return ERR_PTR(-EWOULDBLOCK);
  219. return rq;
  220. }
  221. EXPORT_SYMBOL(blk_mq_alloc_request);
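/*
 * A minimal usage sketch (hypothetical caller, not taken from a real
 * driver): allocate a request for an internal command and be willing to
 * sleep for a free tag, then release it when done.
 *
 *	struct request *rq;
 *
 *	rq = blk_mq_alloc_request(q, WRITE, GFP_KERNEL, false);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *	...
 *	blk_mq_free_request(rq);
 */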
  222. static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
  223. struct blk_mq_ctx *ctx, struct request *rq)
  224. {
  225. const int tag = rq->tag;
  226. struct request_queue *q = rq->q;
  227. if (rq->cmd_flags & REQ_MQ_INFLIGHT)
  228. atomic_dec(&hctx->nr_active);
  229. rq->cmd_flags = 0;
  230. clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
  231. blk_mq_put_tag(hctx, tag, &ctx->last_tag);
  232. blk_mq_queue_exit(q);
  233. }
  234. void blk_mq_free_request(struct request *rq)
  235. {
  236. struct blk_mq_ctx *ctx = rq->mq_ctx;
  237. struct blk_mq_hw_ctx *hctx;
  238. struct request_queue *q = rq->q;
  239. ctx->rq_completed[rq_is_sync(rq)]++;
  240. hctx = q->mq_ops->map_queue(q, ctx->cpu);
  241. __blk_mq_free_request(hctx, ctx, rq);
  242. }
  243. inline void __blk_mq_end_request(struct request *rq, int error)
  244. {
  245. blk_account_io_done(rq);
  246. if (rq->end_io) {
  247. rq->end_io(rq, error);
  248. } else {
  249. if (unlikely(blk_bidi_rq(rq)))
  250. blk_mq_free_request(rq->next_rq);
  251. blk_mq_free_request(rq);
  252. }
  253. }
  254. EXPORT_SYMBOL(__blk_mq_end_request);
  255. void blk_mq_end_request(struct request *rq, int error)
  256. {
  257. if (blk_update_request(rq, error, blk_rq_bytes(rq)))
  258. BUG();
  259. __blk_mq_end_request(rq, error);
  260. }
  261. EXPORT_SYMBOL(blk_mq_end_request);
  262. static void __blk_mq_complete_request_remote(void *data)
  263. {
  264. struct request *rq = data;
  265. rq->q->softirq_done_fn(rq);
  266. }
  267. static void blk_mq_ipi_complete_request(struct request *rq)
  268. {
  269. struct blk_mq_ctx *ctx = rq->mq_ctx;
  270. bool shared = false;
  271. int cpu;
  272. if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
  273. rq->q->softirq_done_fn(rq);
  274. return;
  275. }
  276. cpu = get_cpu();
  277. if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
  278. shared = cpus_share_cache(cpu, ctx->cpu);
  279. if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
  280. rq->csd.func = __blk_mq_complete_request_remote;
  281. rq->csd.info = rq;
  282. rq->csd.flags = 0;
  283. smp_call_function_single_async(ctx->cpu, &rq->csd);
  284. } else {
  285. rq->q->softirq_done_fn(rq);
  286. }
  287. put_cpu();
  288. }
  289. void __blk_mq_complete_request(struct request *rq)
  290. {
  291. struct request_queue *q = rq->q;
  292. if (!q->softirq_done_fn)
  293. blk_mq_end_request(rq, rq->errors);
  294. else
  295. blk_mq_ipi_complete_request(rq);
  296. }
  297. /**
  298. * blk_mq_complete_request - end I/O on a request
  299. * @rq: the request being processed
  300. *
  301. * Description:
  302. * Ends all I/O on a request. It does not handle partial completions.
303. * The actual completion happens out-of-order, through an IPI handler.
  304. **/
  305. void blk_mq_complete_request(struct request *rq)
  306. {
  307. struct request_queue *q = rq->q;
  308. if (unlikely(blk_should_fake_timeout(q)))
  309. return;
  310. if (!blk_mark_rq_complete(rq))
  311. __blk_mq_complete_request(rq);
  312. }
  313. EXPORT_SYMBOL(blk_mq_complete_request);
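/*
 * A minimal sketch of the expected calling pattern (hypothetical driver
 * code): the low-level driver calls blk_mq_complete_request() from its
 * completion interrupt; the actual end_io work then runs through the
 * queue's ->softirq_done_fn, possibly on the submitting CPU via the IPI
 * path above.
 *
 *	static void hypothetical_irq_complete(struct request *rq, int error)
 *	{
 *		rq->errors = error;
 *		blk_mq_complete_request(rq);
 *	}
 */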
  314. void blk_mq_start_request(struct request *rq)
  315. {
  316. struct request_queue *q = rq->q;
  317. trace_block_rq_issue(q, rq);
  318. rq->resid_len = blk_rq_bytes(rq);
  319. if (unlikely(blk_bidi_rq(rq)))
  320. rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
  321. blk_add_timer(rq);
  322. /*
323. * Ensure that ->deadline is visible before we set the started
  324. * flag and clear the completed flag.
  325. */
  326. smp_mb__before_atomic();
  327. /*
  328. * Mark us as started and clear complete. Complete might have been
  329. * set if requeue raced with timeout, which then marked it as
  330. * complete. So be sure to clear complete again when we start
  331. * the request, otherwise we'll ignore the completion event.
  332. */
  333. if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
  334. set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
  335. if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
  336. clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
  337. if (q->dma_drain_size && blk_rq_bytes(rq)) {
  338. /*
  339. * Make sure space for the drain appears. We know we can do
  340. * this because max_hw_segments has been adjusted to be one
  341. * fewer than the device can handle.
  342. */
  343. rq->nr_phys_segments++;
  344. }
  345. }
  346. EXPORT_SYMBOL(blk_mq_start_request);
  347. static void __blk_mq_requeue_request(struct request *rq)
  348. {
  349. struct request_queue *q = rq->q;
  350. trace_block_rq_requeue(q, rq);
  351. if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
  352. if (q->dma_drain_size && blk_rq_bytes(rq))
  353. rq->nr_phys_segments--;
  354. }
  355. }
  356. void blk_mq_requeue_request(struct request *rq)
  357. {
  358. __blk_mq_requeue_request(rq);
  359. BUG_ON(blk_queued_rq(rq));
  360. blk_mq_add_to_requeue_list(rq, true);
  361. }
  362. EXPORT_SYMBOL(blk_mq_requeue_request);
  363. static void blk_mq_requeue_work(struct work_struct *work)
  364. {
  365. struct request_queue *q =
  366. container_of(work, struct request_queue, requeue_work);
  367. LIST_HEAD(rq_list);
  368. struct request *rq, *next;
  369. unsigned long flags;
  370. spin_lock_irqsave(&q->requeue_lock, flags);
  371. list_splice_init(&q->requeue_list, &rq_list);
  372. spin_unlock_irqrestore(&q->requeue_lock, flags);
  373. list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
  374. if (!(rq->cmd_flags & REQ_SOFTBARRIER))
  375. continue;
  376. rq->cmd_flags &= ~REQ_SOFTBARRIER;
  377. list_del_init(&rq->queuelist);
  378. blk_mq_insert_request(rq, true, false, false);
  379. }
  380. while (!list_empty(&rq_list)) {
  381. rq = list_entry(rq_list.next, struct request, queuelist);
  382. list_del_init(&rq->queuelist);
  383. blk_mq_insert_request(rq, false, false, false);
  384. }
  385. /*
  386. * Use the start variant of queue running here, so that running
  387. * the requeue work will kick stopped queues.
  388. */
  389. blk_mq_start_hw_queues(q);
  390. }
  391. void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
  392. {
  393. struct request_queue *q = rq->q;
  394. unsigned long flags;
  395. /*
  396. * We abuse this flag that is otherwise used by the I/O scheduler to
397. * request head insertion from the workqueue.
  398. */
  399. BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
  400. spin_lock_irqsave(&q->requeue_lock, flags);
  401. if (at_head) {
  402. rq->cmd_flags |= REQ_SOFTBARRIER;
  403. list_add(&rq->queuelist, &q->requeue_list);
  404. } else {
  405. list_add_tail(&rq->queuelist, &q->requeue_list);
  406. }
  407. spin_unlock_irqrestore(&q->requeue_lock, flags);
  408. }
  409. EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
  410. void blk_mq_kick_requeue_list(struct request_queue *q)
  411. {
  412. kblockd_schedule_work(&q->requeue_work);
  413. }
  414. EXPORT_SYMBOL(blk_mq_kick_requeue_list);
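/*
 * A minimal sketch of the expected requeue pattern (hypothetical caller):
 * a driver that cannot process a started request right now hands it back
 * and kicks the list so blk_mq_requeue_work() reinserts and reruns it.
 *
 *	blk_mq_requeue_request(rq);
 *	blk_mq_kick_requeue_list(rq->q);
 */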
  415. static inline bool is_flush_request(struct request *rq,
  416. struct blk_flush_queue *fq, unsigned int tag)
  417. {
  418. return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
  419. fq->flush_rq->tag == tag);
  420. }
  421. struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
  422. {
  423. struct request *rq = tags->rqs[tag];
  424. /* mq_ctx of flush rq is always cloned from the corresponding req */
  425. struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);
  426. if (!is_flush_request(rq, fq, tag))
  427. return rq;
  428. return fq->flush_rq;
  429. }
  430. EXPORT_SYMBOL(blk_mq_tag_to_rq);
  431. struct blk_mq_timeout_data {
  432. unsigned long next;
  433. unsigned int next_set;
  434. };
  435. void blk_mq_rq_timed_out(struct request *req, bool reserved)
  436. {
  437. struct blk_mq_ops *ops = req->q->mq_ops;
  438. enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
  439. /*
  440. * We know that complete is set at this point. If STARTED isn't set
  441. * anymore, then the request isn't active and the "timeout" should
  442. * just be ignored. This can happen due to the bitflag ordering.
  443. * Timeout first checks if STARTED is set, and if it is, assumes
  444. * the request is active. But if we race with completion, then
445. * both flags will get cleared. So check here again, and ignore
  446. * a timeout event with a request that isn't active.
  447. */
  448. if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
  449. return;
  450. if (ops->timeout)
  451. ret = ops->timeout(req, reserved);
  452. switch (ret) {
  453. case BLK_EH_HANDLED:
  454. __blk_mq_complete_request(req);
  455. break;
  456. case BLK_EH_RESET_TIMER:
  457. blk_add_timer(req);
  458. blk_clear_rq_complete(req);
  459. break;
  460. case BLK_EH_NOT_HANDLED:
  461. break;
  462. default:
  463. printk(KERN_ERR "block: bad eh return: %d\n", ret);
  464. break;
  465. }
  466. }
  467. static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
  468. struct request *rq, void *priv, bool reserved)
  469. {
  470. struct blk_mq_timeout_data *data = priv;
  471. if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
  472. return;
  473. if (time_after_eq(jiffies, rq->deadline)) {
  474. if (!blk_mark_rq_complete(rq))
  475. blk_mq_rq_timed_out(rq, reserved);
  476. } else if (!data->next_set || time_after(data->next, rq->deadline)) {
  477. data->next = rq->deadline;
  478. data->next_set = 1;
  479. }
  480. }
  481. static void blk_mq_rq_timer(unsigned long priv)
  482. {
  483. struct request_queue *q = (struct request_queue *)priv;
  484. struct blk_mq_timeout_data data = {
  485. .next = 0,
  486. .next_set = 0,
  487. };
  488. struct blk_mq_hw_ctx *hctx;
  489. int i;
  490. queue_for_each_hw_ctx(q, hctx, i) {
  491. /*
492. * If no software queues are currently mapped to this
  493. * hardware queue, there's nothing to check
  494. */
  495. if (!hctx->nr_ctx || !hctx->tags)
  496. continue;
  497. blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
  498. }
  499. if (data.next_set) {
  500. data.next = blk_rq_timeout(round_jiffies_up(data.next));
  501. mod_timer(&q->timeout, data.next);
  502. } else {
  503. queue_for_each_hw_ctx(q, hctx, i)
  504. blk_mq_tag_idle(hctx);
  505. }
  506. }
  507. /*
  508. * Reverse check our software queue for entries that we could potentially
  509. * merge with. Currently includes a hand-wavy stop count of 8, to not spend
  510. * too much time checking for merges.
  511. */
  512. static bool blk_mq_attempt_merge(struct request_queue *q,
  513. struct blk_mq_ctx *ctx, struct bio *bio)
  514. {
  515. struct request *rq;
  516. int checked = 8;
  517. list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
  518. int el_ret;
  519. if (!checked--)
  520. break;
  521. if (!blk_rq_merge_ok(rq, bio))
  522. continue;
  523. el_ret = blk_try_merge(rq, bio);
  524. if (el_ret == ELEVATOR_BACK_MERGE) {
  525. if (bio_attempt_back_merge(q, rq, bio)) {
  526. ctx->rq_merged++;
  527. return true;
  528. }
  529. break;
  530. } else if (el_ret == ELEVATOR_FRONT_MERGE) {
  531. if (bio_attempt_front_merge(q, rq, bio)) {
  532. ctx->rq_merged++;
  533. return true;
  534. }
  535. break;
  536. }
  537. }
  538. return false;
  539. }
  540. /*
  541. * Process software queues that have been marked busy, splicing them
542. * to the for-dispatch list.
  543. */
  544. static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
  545. {
  546. struct blk_mq_ctx *ctx;
  547. int i;
  548. for (i = 0; i < hctx->ctx_map.map_size; i++) {
  549. struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
  550. unsigned int off, bit;
  551. if (!bm->word)
  552. continue;
  553. bit = 0;
  554. off = i * hctx->ctx_map.bits_per_word;
  555. do {
  556. bit = find_next_bit(&bm->word, bm->depth, bit);
  557. if (bit >= bm->depth)
  558. break;
  559. ctx = hctx->ctxs[bit + off];
  560. clear_bit(bit, &bm->word);
  561. spin_lock(&ctx->lock);
  562. list_splice_tail_init(&ctx->rq_list, list);
  563. spin_unlock(&ctx->lock);
  564. bit++;
  565. } while (1);
  566. }
  567. }
  568. /*
  569. * Run this hardware queue, pulling any software queues mapped to it in.
  570. * Note that this function currently has various problems around ordering
  571. * of IO. In particular, we'd like FIFO behaviour on handling existing
  572. * items on the hctx->dispatch list. Ignore that for now.
  573. */
  574. static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
  575. {
  576. struct request_queue *q = hctx->queue;
  577. struct request *rq;
  578. LIST_HEAD(rq_list);
  579. int queued;
  580. WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
  581. if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
  582. return;
  583. hctx->run++;
  584. /*
  585. * Touch any software queue that has pending entries.
  586. */
  587. flush_busy_ctxs(hctx, &rq_list);
  588. /*
  589. * If we have previous entries on our dispatch list, grab them
  590. * and stuff them at the front for more fair dispatch.
  591. */
  592. if (!list_empty_careful(&hctx->dispatch)) {
  593. spin_lock(&hctx->lock);
  594. if (!list_empty(&hctx->dispatch))
  595. list_splice_init(&hctx->dispatch, &rq_list);
  596. spin_unlock(&hctx->lock);
  597. }
  598. /*
  599. * Now process all the entries, sending them to the driver.
  600. */
  601. queued = 0;
  602. while (!list_empty(&rq_list)) {
  603. int ret;
  604. rq = list_first_entry(&rq_list, struct request, queuelist);
  605. list_del_init(&rq->queuelist);
  606. ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
  607. switch (ret) {
  608. case BLK_MQ_RQ_QUEUE_OK:
  609. queued++;
  610. continue;
  611. case BLK_MQ_RQ_QUEUE_BUSY:
  612. list_add(&rq->queuelist, &rq_list);
  613. __blk_mq_requeue_request(rq);
  614. break;
  615. default:
  616. pr_err("blk-mq: bad return on queue: %d\n", ret);
  617. case BLK_MQ_RQ_QUEUE_ERROR:
  618. rq->errors = -EIO;
  619. blk_mq_end_request(rq, rq->errors);
  620. break;
  621. }
  622. if (ret == BLK_MQ_RQ_QUEUE_BUSY)
  623. break;
  624. }
  625. if (!queued)
  626. hctx->dispatched[0]++;
  627. else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
  628. hctx->dispatched[ilog2(queued) + 1]++;
  629. /*
  630. * Any items that need requeuing? Stuff them into hctx->dispatch,
  631. * that is where we will continue on next queue run.
  632. */
  633. if (!list_empty(&rq_list)) {
  634. spin_lock(&hctx->lock);
  635. list_splice(&rq_list, &hctx->dispatch);
  636. spin_unlock(&hctx->lock);
  637. }
  638. }
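/*
 * A minimal sketch of the driver side of the dispatch loop above
 * (hypothetical ->queue_rq implementation; the helper names are made up):
 * BLK_MQ_RQ_QUEUE_OK means the request was handed to hardware,
 * BLK_MQ_RQ_QUEUE_BUSY asks blk-mq to stop dispatching and retry later via
 * hctx->dispatch, and BLK_MQ_RQ_QUEUE_ERROR fails the request with -EIO.
 *
 *	static int hypothetical_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					 struct request *rq, bool last)
 *	{
 *		if (!hypothetical_hw_has_room(hctx->driver_data))
 *			return BLK_MQ_RQ_QUEUE_BUSY;
 *		if (hypothetical_hw_submit(hctx->driver_data, rq))
 *			return BLK_MQ_RQ_QUEUE_ERROR;
 *		return BLK_MQ_RQ_QUEUE_OK;
 *	}
 */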
  639. /*
  640. * It'd be great if the workqueue API had a way to pass
  641. * in a mask and had some smarts for more clever placement.
642. * For now we just round-robin here, switching every
  643. * BLK_MQ_CPU_WORK_BATCH queued items.
  644. */
  645. static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
  646. {
  647. int cpu = hctx->next_cpu;
  648. if (--hctx->next_cpu_batch <= 0) {
  649. int next_cpu;
  650. next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
  651. if (next_cpu >= nr_cpu_ids)
  652. next_cpu = cpumask_first(hctx->cpumask);
  653. hctx->next_cpu = next_cpu;
  654. hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
  655. }
  656. return cpu;
  657. }
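/*
 * Illustrative example: with a cpumask of {0, 4}, the first
 * BLK_MQ_CPU_WORK_BATCH async runs are punted to CPU 0, the next batch to
 * CPU 4, and so on; the CPU returned is the one picked before the batch
 * counter rolled over, so the switch takes effect on the following call.
 */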
  658. void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
  659. {
  660. if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
  661. return;
  662. if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
  663. __blk_mq_run_hw_queue(hctx);
  664. else if (hctx->queue->nr_hw_queues == 1)
  665. kblockd_schedule_delayed_work(&hctx->run_work, 0);
  666. else {
  667. unsigned int cpu;
  668. cpu = blk_mq_hctx_next_cpu(hctx);
  669. kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
  670. }
  671. }
  672. void blk_mq_run_queues(struct request_queue *q, bool async)
  673. {
  674. struct blk_mq_hw_ctx *hctx;
  675. int i;
  676. queue_for_each_hw_ctx(q, hctx, i) {
  677. if ((!blk_mq_hctx_has_pending(hctx) &&
  678. list_empty_careful(&hctx->dispatch)) ||
  679. test_bit(BLK_MQ_S_STOPPED, &hctx->state))
  680. continue;
  681. preempt_disable();
  682. blk_mq_run_hw_queue(hctx, async);
  683. preempt_enable();
  684. }
  685. }
  686. EXPORT_SYMBOL(blk_mq_run_queues);
  687. void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
  688. {
  689. cancel_delayed_work(&hctx->run_work);
  690. cancel_delayed_work(&hctx->delay_work);
  691. set_bit(BLK_MQ_S_STOPPED, &hctx->state);
  692. }
  693. EXPORT_SYMBOL(blk_mq_stop_hw_queue);
  694. void blk_mq_stop_hw_queues(struct request_queue *q)
  695. {
  696. struct blk_mq_hw_ctx *hctx;
  697. int i;
  698. queue_for_each_hw_ctx(q, hctx, i)
  699. blk_mq_stop_hw_queue(hctx);
  700. }
  701. EXPORT_SYMBOL(blk_mq_stop_hw_queues);
  702. void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
  703. {
  704. clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
  705. preempt_disable();
  706. blk_mq_run_hw_queue(hctx, false);
  707. preempt_enable();
  708. }
  709. EXPORT_SYMBOL(blk_mq_start_hw_queue);
  710. void blk_mq_start_hw_queues(struct request_queue *q)
  711. {
  712. struct blk_mq_hw_ctx *hctx;
  713. int i;
  714. queue_for_each_hw_ctx(q, hctx, i)
  715. blk_mq_start_hw_queue(hctx);
  716. }
  717. EXPORT_SYMBOL(blk_mq_start_hw_queues);
  718. void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
  719. {
  720. struct blk_mq_hw_ctx *hctx;
  721. int i;
  722. queue_for_each_hw_ctx(q, hctx, i) {
  723. if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
  724. continue;
  725. clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
  726. preempt_disable();
  727. blk_mq_run_hw_queue(hctx, async);
  728. preempt_enable();
  729. }
  730. }
  731. EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
  732. static void blk_mq_run_work_fn(struct work_struct *work)
  733. {
  734. struct blk_mq_hw_ctx *hctx;
  735. hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
  736. __blk_mq_run_hw_queue(hctx);
  737. }
  738. static void blk_mq_delay_work_fn(struct work_struct *work)
  739. {
  740. struct blk_mq_hw_ctx *hctx;
  741. hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
  742. if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
  743. __blk_mq_run_hw_queue(hctx);
  744. }
  745. void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
  746. {
  747. unsigned long tmo = msecs_to_jiffies(msecs);
  748. if (hctx->queue->nr_hw_queues == 1)
  749. kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
  750. else {
  751. unsigned int cpu;
  752. cpu = blk_mq_hctx_next_cpu(hctx);
  753. kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
  754. }
  755. }
  756. EXPORT_SYMBOL(blk_mq_delay_queue);
  757. static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
  758. struct request *rq, bool at_head)
  759. {
  760. struct blk_mq_ctx *ctx = rq->mq_ctx;
  761. trace_block_rq_insert(hctx->queue, rq);
  762. if (at_head)
  763. list_add(&rq->queuelist, &ctx->rq_list);
  764. else
  765. list_add_tail(&rq->queuelist, &ctx->rq_list);
  766. blk_mq_hctx_mark_pending(hctx, ctx);
  767. }
  768. void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
  769. bool async)
  770. {
  771. struct request_queue *q = rq->q;
  772. struct blk_mq_hw_ctx *hctx;
  773. struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;
  774. current_ctx = blk_mq_get_ctx(q);
  775. if (!cpu_online(ctx->cpu))
  776. rq->mq_ctx = ctx = current_ctx;
  777. hctx = q->mq_ops->map_queue(q, ctx->cpu);
  778. spin_lock(&ctx->lock);
  779. __blk_mq_insert_request(hctx, rq, at_head);
  780. spin_unlock(&ctx->lock);
  781. if (run_queue)
  782. blk_mq_run_hw_queue(hctx, async);
  783. blk_mq_put_ctx(current_ctx);
  784. }
  785. static void blk_mq_insert_requests(struct request_queue *q,
  786. struct blk_mq_ctx *ctx,
  787. struct list_head *list,
  788. int depth,
  789. bool from_schedule)
  790. {
  791. struct blk_mq_hw_ctx *hctx;
  792. struct blk_mq_ctx *current_ctx;
  793. trace_block_unplug(q, depth, !from_schedule);
  794. current_ctx = blk_mq_get_ctx(q);
  795. if (!cpu_online(ctx->cpu))
  796. ctx = current_ctx;
  797. hctx = q->mq_ops->map_queue(q, ctx->cpu);
  798. /*
  799. * preemption doesn't flush plug list, so it's possible ctx->cpu is
  800. * offline now
  801. */
  802. spin_lock(&ctx->lock);
  803. while (!list_empty(list)) {
  804. struct request *rq;
  805. rq = list_first_entry(list, struct request, queuelist);
  806. list_del_init(&rq->queuelist);
  807. rq->mq_ctx = ctx;
  808. __blk_mq_insert_request(hctx, rq, false);
  809. }
  810. spin_unlock(&ctx->lock);
  811. blk_mq_run_hw_queue(hctx, from_schedule);
  812. blk_mq_put_ctx(current_ctx);
  813. }
  814. static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
  815. {
  816. struct request *rqa = container_of(a, struct request, queuelist);
  817. struct request *rqb = container_of(b, struct request, queuelist);
  818. return !(rqa->mq_ctx < rqb->mq_ctx ||
  819. (rqa->mq_ctx == rqb->mq_ctx &&
  820. blk_rq_pos(rqa) < blk_rq_pos(rqb)));
  821. }
  822. void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
  823. {
  824. struct blk_mq_ctx *this_ctx;
  825. struct request_queue *this_q;
  826. struct request *rq;
  827. LIST_HEAD(list);
  828. LIST_HEAD(ctx_list);
  829. unsigned int depth;
  830. list_splice_init(&plug->mq_list, &list);
  831. list_sort(NULL, &list, plug_ctx_cmp);
  832. this_q = NULL;
  833. this_ctx = NULL;
  834. depth = 0;
  835. while (!list_empty(&list)) {
  836. rq = list_entry_rq(list.next);
  837. list_del_init(&rq->queuelist);
  838. BUG_ON(!rq->q);
  839. if (rq->mq_ctx != this_ctx) {
  840. if (this_ctx) {
  841. blk_mq_insert_requests(this_q, this_ctx,
  842. &ctx_list, depth,
  843. from_schedule);
  844. }
  845. this_ctx = rq->mq_ctx;
  846. this_q = rq->q;
  847. depth = 0;
  848. }
  849. depth++;
  850. list_add_tail(&rq->queuelist, &ctx_list);
  851. }
  852. /*
  853. * If 'this_ctx' is set, we know we have entries to complete
  854. * on 'ctx_list'. Do those.
  855. */
  856. if (this_ctx) {
  857. blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
  858. from_schedule);
  859. }
  860. }
  861. static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
  862. {
  863. init_request_from_bio(rq, bio);
  864. if (blk_do_io_stat(rq))
  865. blk_account_io_start(rq, 1);
  866. }
  867. static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
  868. {
  869. return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
  870. !blk_queue_nomerges(hctx->queue);
  871. }
  872. static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
  873. struct blk_mq_ctx *ctx,
  874. struct request *rq, struct bio *bio)
  875. {
  876. if (!hctx_allow_merges(hctx)) {
  877. blk_mq_bio_to_request(rq, bio);
  878. spin_lock(&ctx->lock);
  879. insert_rq:
  880. __blk_mq_insert_request(hctx, rq, false);
  881. spin_unlock(&ctx->lock);
  882. return false;
  883. } else {
  884. struct request_queue *q = hctx->queue;
  885. spin_lock(&ctx->lock);
  886. if (!blk_mq_attempt_merge(q, ctx, bio)) {
  887. blk_mq_bio_to_request(rq, bio);
  888. goto insert_rq;
  889. }
  890. spin_unlock(&ctx->lock);
  891. __blk_mq_free_request(hctx, ctx, rq);
  892. return true;
  893. }
  894. }
  895. struct blk_map_ctx {
  896. struct blk_mq_hw_ctx *hctx;
  897. struct blk_mq_ctx *ctx;
  898. };
  899. static struct request *blk_mq_map_request(struct request_queue *q,
  900. struct bio *bio,
  901. struct blk_map_ctx *data)
  902. {
  903. struct blk_mq_hw_ctx *hctx;
  904. struct blk_mq_ctx *ctx;
  905. struct request *rq;
  906. int rw = bio_data_dir(bio);
  907. struct blk_mq_alloc_data alloc_data;
  908. if (unlikely(blk_mq_queue_enter(q))) {
  909. bio_endio(bio, -EIO);
  910. return NULL;
  911. }
  912. ctx = blk_mq_get_ctx(q);
  913. hctx = q->mq_ops->map_queue(q, ctx->cpu);
  914. if (rw_is_sync(bio->bi_rw))
  915. rw |= REQ_SYNC;
  916. trace_block_getrq(q, bio, rw);
  917. blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
  918. hctx);
  919. rq = __blk_mq_alloc_request(&alloc_data, rw);
  920. if (unlikely(!rq)) {
  921. __blk_mq_run_hw_queue(hctx);
  922. blk_mq_put_ctx(ctx);
  923. trace_block_sleeprq(q, bio, rw);
  924. ctx = blk_mq_get_ctx(q);
  925. hctx = q->mq_ops->map_queue(q, ctx->cpu);
  926. blk_mq_set_alloc_data(&alloc_data, q,
  927. __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
  928. rq = __blk_mq_alloc_request(&alloc_data, rw);
  929. ctx = alloc_data.ctx;
  930. hctx = alloc_data.hctx;
  931. }
  932. hctx->queued++;
  933. data->hctx = hctx;
  934. data->ctx = ctx;
  935. return rq;
  936. }
  937. /*
  938. * Multiple hardware queue variant. This will not use per-process plugs,
  939. * but will attempt to bypass the hctx queueing if we can go straight to
  940. * hardware for SYNC IO.
  941. */
  942. static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
  943. {
  944. const int is_sync = rw_is_sync(bio->bi_rw);
  945. const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
  946. struct blk_map_ctx data;
  947. struct request *rq;
  948. blk_queue_bounce(q, &bio);
  949. if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
  950. bio_endio(bio, -EIO);
  951. return;
  952. }
  953. rq = blk_mq_map_request(q, bio, &data);
  954. if (unlikely(!rq))
  955. return;
  956. if (unlikely(is_flush_fua)) {
  957. blk_mq_bio_to_request(rq, bio);
  958. blk_insert_flush(rq);
  959. goto run_queue;
  960. }
  961. if (is_sync) {
  962. int ret;
  963. blk_mq_bio_to_request(rq, bio);
  964. /*
965. * If the driver accepted the request, we are done. If it returned
966. * an error, fail the request. Otherwise (busy), fall back to adding
967. * it to our software queue as we previously would have done.
  968. */
  969. ret = q->mq_ops->queue_rq(data.hctx, rq, true);
  970. if (ret == BLK_MQ_RQ_QUEUE_OK)
  971. goto done;
  972. else {
  973. __blk_mq_requeue_request(rq);
  974. if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
  975. rq->errors = -EIO;
  976. blk_mq_end_request(rq, rq->errors);
  977. goto done;
  978. }
  979. }
  980. }
  981. if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
  982. /*
  983. * For a SYNC request, send it to the hardware immediately. For
  984. * an ASYNC request, just ensure that we run it later on. The
  985. * latter allows for merging opportunities and more efficient
  986. * dispatching.
  987. */
  988. run_queue:
  989. blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
  990. }
  991. done:
  992. blk_mq_put_ctx(data.ctx);
  993. }
  994. /*
  995. * Single hardware queue variant. This will attempt to use any per-process
  996. * plug for merging and IO deferral.
  997. */
  998. static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
  999. {
  1000. const int is_sync = rw_is_sync(bio->bi_rw);
  1001. const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
  1002. unsigned int use_plug, request_count = 0;
  1003. struct blk_map_ctx data;
  1004. struct request *rq;
  1005. /*
1006. * Only use a per-process plug for async, non-flush IO; sync and
1007. * flush/FUA requests are sent to the hardware queue directly.
  1008. */
  1009. use_plug = !is_flush_fua && !is_sync;
  1010. blk_queue_bounce(q, &bio);
  1011. if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
  1012. bio_endio(bio, -EIO);
  1013. return;
  1014. }
  1015. if (use_plug && !blk_queue_nomerges(q) &&
  1016. blk_attempt_plug_merge(q, bio, &request_count))
  1017. return;
  1018. rq = blk_mq_map_request(q, bio, &data);
  1019. if (unlikely(!rq))
  1020. return;
  1021. if (unlikely(is_flush_fua)) {
  1022. blk_mq_bio_to_request(rq, bio);
  1023. blk_insert_flush(rq);
  1024. goto run_queue;
  1025. }
  1026. /*
1027. * If a task plug exists, use it: since it is completely lockless, we
1028. * can use it to temporarily store requests until the task is either
1029. * done or scheduled away.
  1030. */
  1031. if (use_plug) {
  1032. struct blk_plug *plug = current->plug;
  1033. if (plug) {
  1034. blk_mq_bio_to_request(rq, bio);
  1035. if (list_empty(&plug->mq_list))
  1036. trace_block_plug(q);
  1037. else if (request_count >= BLK_MAX_REQUEST_COUNT) {
  1038. blk_flush_plug_list(plug, false);
  1039. trace_block_plug(q);
  1040. }
  1041. list_add_tail(&rq->queuelist, &plug->mq_list);
  1042. blk_mq_put_ctx(data.ctx);
  1043. return;
  1044. }
  1045. }
  1046. if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
  1047. /*
  1048. * For a SYNC request, send it to the hardware immediately. For
  1049. * an ASYNC request, just ensure that we run it later on. The
  1050. * latter allows for merging opportunities and more efficient
  1051. * dispatching.
  1052. */
  1053. run_queue:
  1054. blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
  1055. }
  1056. blk_mq_put_ctx(data.ctx);
  1057. }
  1058. /*
  1059. * Default mapping to a software queue, since we use one per CPU.
  1060. */
  1061. struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
  1062. {
  1063. return q->queue_hw_ctx[q->mq_map[cpu]];
  1064. }
  1065. EXPORT_SYMBOL(blk_mq_map_queue);
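/*
 * Illustrative example (hypothetical layout): with two hardware queues on
 * a four-CPU system, blk_mq_make_queue_map() might produce
 * mq_map[] == {0, 0, 1, 1}, so blk_mq_map_queue(q, 2) returns
 * q->queue_hw_ctx[1].
 */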
  1066. static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
  1067. struct blk_mq_tags *tags, unsigned int hctx_idx)
  1068. {
  1069. struct page *page;
  1070. if (tags->rqs && set->ops->exit_request) {
  1071. int i;
  1072. for (i = 0; i < tags->nr_tags; i++) {
  1073. if (!tags->rqs[i])
  1074. continue;
  1075. set->ops->exit_request(set->driver_data, tags->rqs[i],
  1076. hctx_idx, i);
  1077. tags->rqs[i] = NULL;
  1078. }
  1079. }
  1080. while (!list_empty(&tags->page_list)) {
  1081. page = list_first_entry(&tags->page_list, struct page, lru);
  1082. list_del_init(&page->lru);
  1083. __free_pages(page, page->private);
  1084. }
  1085. kfree(tags->rqs);
  1086. blk_mq_free_tags(tags);
  1087. }
  1088. static size_t order_to_size(unsigned int order)
  1089. {
  1090. return (size_t)PAGE_SIZE << order;
  1091. }
  1092. static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
  1093. unsigned int hctx_idx)
  1094. {
  1095. struct blk_mq_tags *tags;
  1096. unsigned int i, j, entries_per_page, max_order = 4;
  1097. size_t rq_size, left;
  1098. tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
  1099. set->numa_node);
  1100. if (!tags)
  1101. return NULL;
  1102. INIT_LIST_HEAD(&tags->page_list);
  1103. tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
  1104. GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
  1105. set->numa_node);
  1106. if (!tags->rqs) {
  1107. blk_mq_free_tags(tags);
  1108. return NULL;
  1109. }
  1110. /*
  1111. * rq_size is the size of the request plus driver payload, rounded
  1112. * to the cacheline size
  1113. */
  1114. rq_size = round_up(sizeof(struct request) + set->cmd_size,
  1115. cache_line_size());
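/*
 * Illustrative arithmetic (assuming a 64-byte cache line and a cmd_size
 * that brings sizeof(struct request) + cmd_size to, say, 330 bytes):
 * round_up(330, 64) == 384, so each request slot consumes 384 bytes and a
 * max_order allocation (64KB with 4KB pages) holds 170 of them.
 */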
  1116. left = rq_size * set->queue_depth;
  1117. for (i = 0; i < set->queue_depth; ) {
  1118. int this_order = max_order;
  1119. struct page *page;
  1120. int to_do;
  1121. void *p;
  1122. while (left < order_to_size(this_order - 1) && this_order)
  1123. this_order--;
  1124. do {
  1125. page = alloc_pages_node(set->numa_node,
  1126. GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
  1127. this_order);
  1128. if (page)
  1129. break;
  1130. if (!this_order--)
  1131. break;
  1132. if (order_to_size(this_order) < rq_size)
  1133. break;
  1134. } while (1);
  1135. if (!page)
  1136. goto fail;
  1137. page->private = this_order;
  1138. list_add_tail(&page->lru, &tags->page_list);
  1139. p = page_address(page);
  1140. entries_per_page = order_to_size(this_order) / rq_size;
  1141. to_do = min(entries_per_page, set->queue_depth - i);
  1142. left -= to_do * rq_size;
  1143. for (j = 0; j < to_do; j++) {
  1144. tags->rqs[i] = p;
  1145. tags->rqs[i]->atomic_flags = 0;
  1146. tags->rqs[i]->cmd_flags = 0;
  1147. if (set->ops->init_request) {
  1148. if (set->ops->init_request(set->driver_data,
  1149. tags->rqs[i], hctx_idx, i,
  1150. set->numa_node)) {
  1151. tags->rqs[i] = NULL;
  1152. goto fail;
  1153. }
  1154. }
  1155. p += rq_size;
  1156. i++;
  1157. }
  1158. }
  1159. return tags;
  1160. fail:
  1161. blk_mq_free_rq_map(set, tags, hctx_idx);
  1162. return NULL;
  1163. }
  1164. static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
  1165. {
  1166. kfree(bitmap->map);
  1167. }
  1168. static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
  1169. {
  1170. unsigned int bpw = 8, total, num_maps, i;
  1171. bitmap->bits_per_word = bpw;
  1172. num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
  1173. bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
  1174. GFP_KERNEL, node);
  1175. if (!bitmap->map)
  1176. return -ENOMEM;
  1177. bitmap->map_size = num_maps;
  1178. total = nr_cpu_ids;
  1179. for (i = 0; i < num_maps; i++) {
  1180. bitmap->map[i].depth = min(total, bitmap->bits_per_word);
  1181. total -= bitmap->map[i].depth;
  1182. }
  1183. return 0;
  1184. }
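/*
 * Worked example: with nr_cpu_ids == 12 and bits_per_word == 8, num_maps
 * is ALIGN(12, 8) / 8 == 2, and the depths come out as map[0].depth == 8
 * and map[1].depth == 4, so every possible CPU has exactly one bit.
 */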
  1185. static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
  1186. {
  1187. struct request_queue *q = hctx->queue;
  1188. struct blk_mq_ctx *ctx;
  1189. LIST_HEAD(tmp);
  1190. /*
  1191. * Move ctx entries to new CPU, if this one is going away.
  1192. */
  1193. ctx = __blk_mq_get_ctx(q, cpu);
  1194. spin_lock(&ctx->lock);
  1195. if (!list_empty(&ctx->rq_list)) {
  1196. list_splice_init(&ctx->rq_list, &tmp);
  1197. blk_mq_hctx_clear_pending(hctx, ctx);
  1198. }
  1199. spin_unlock(&ctx->lock);
  1200. if (list_empty(&tmp))
  1201. return NOTIFY_OK;
  1202. ctx = blk_mq_get_ctx(q);
  1203. spin_lock(&ctx->lock);
  1204. while (!list_empty(&tmp)) {
  1205. struct request *rq;
  1206. rq = list_first_entry(&tmp, struct request, queuelist);
  1207. rq->mq_ctx = ctx;
  1208. list_move_tail(&rq->queuelist, &ctx->rq_list);
  1209. }
  1210. hctx = q->mq_ops->map_queue(q, ctx->cpu);
  1211. blk_mq_hctx_mark_pending(hctx, ctx);
  1212. spin_unlock(&ctx->lock);
  1213. blk_mq_run_hw_queue(hctx, true);
  1214. blk_mq_put_ctx(ctx);
  1215. return NOTIFY_OK;
  1216. }
  1217. static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
  1218. {
  1219. struct request_queue *q = hctx->queue;
  1220. struct blk_mq_tag_set *set = q->tag_set;
  1221. if (set->tags[hctx->queue_num])
  1222. return NOTIFY_OK;
  1223. set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
  1224. if (!set->tags[hctx->queue_num])
  1225. return NOTIFY_STOP;
  1226. hctx->tags = set->tags[hctx->queue_num];
  1227. return NOTIFY_OK;
  1228. }
  1229. static int blk_mq_hctx_notify(void *data, unsigned long action,
  1230. unsigned int cpu)
  1231. {
  1232. struct blk_mq_hw_ctx *hctx = data;
  1233. if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
  1234. return blk_mq_hctx_cpu_offline(hctx, cpu);
  1235. else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
  1236. return blk_mq_hctx_cpu_online(hctx, cpu);
  1237. return NOTIFY_OK;
  1238. }
  1239. static void blk_mq_exit_hctx(struct request_queue *q,
  1240. struct blk_mq_tag_set *set,
  1241. struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
  1242. {
  1243. unsigned flush_start_tag = set->queue_depth;
  1244. blk_mq_tag_idle(hctx);
  1245. if (set->ops->exit_request)
  1246. set->ops->exit_request(set->driver_data,
  1247. hctx->fq->flush_rq, hctx_idx,
  1248. flush_start_tag + hctx_idx);
  1249. if (set->ops->exit_hctx)
  1250. set->ops->exit_hctx(hctx, hctx_idx);
  1251. blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
  1252. blk_free_flush_queue(hctx->fq);
  1253. kfree(hctx->ctxs);
  1254. blk_mq_free_bitmap(&hctx->ctx_map);
  1255. }
  1256. static void blk_mq_exit_hw_queues(struct request_queue *q,
  1257. struct blk_mq_tag_set *set, int nr_queue)
  1258. {
  1259. struct blk_mq_hw_ctx *hctx;
  1260. unsigned int i;
  1261. queue_for_each_hw_ctx(q, hctx, i) {
  1262. if (i == nr_queue)
  1263. break;
  1264. blk_mq_exit_hctx(q, set, hctx, i);
  1265. }
  1266. }
  1267. static void blk_mq_free_hw_queues(struct request_queue *q,
  1268. struct blk_mq_tag_set *set)
  1269. {
  1270. struct blk_mq_hw_ctx *hctx;
  1271. unsigned int i;
  1272. queue_for_each_hw_ctx(q, hctx, i) {
  1273. free_cpumask_var(hctx->cpumask);
  1274. kfree(hctx);
  1275. }
  1276. }
  1277. static int blk_mq_init_hctx(struct request_queue *q,
  1278. struct blk_mq_tag_set *set,
  1279. struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
  1280. {
  1281. int node;
  1282. unsigned flush_start_tag = set->queue_depth;
  1283. node = hctx->numa_node;
  1284. if (node == NUMA_NO_NODE)
  1285. node = hctx->numa_node = set->numa_node;
  1286. INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
  1287. INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
  1288. spin_lock_init(&hctx->lock);
  1289. INIT_LIST_HEAD(&hctx->dispatch);
  1290. hctx->queue = q;
  1291. hctx->queue_num = hctx_idx;
  1292. hctx->flags = set->flags;
  1293. hctx->cmd_size = set->cmd_size;
  1294. blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
  1295. blk_mq_hctx_notify, hctx);
  1296. blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
  1297. hctx->tags = set->tags[hctx_idx];
  1298. /*
  1299. * Allocate space for all possible cpus to avoid allocation at
  1300. * runtime
  1301. */
  1302. hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
  1303. GFP_KERNEL, node);
  1304. if (!hctx->ctxs)
  1305. goto unregister_cpu_notifier;
  1306. if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
  1307. goto free_ctxs;
  1308. hctx->nr_ctx = 0;
  1309. if (set->ops->init_hctx &&
  1310. set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
  1311. goto free_bitmap;
  1312. hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
  1313. if (!hctx->fq)
  1314. goto exit_hctx;
  1315. if (set->ops->init_request &&
  1316. set->ops->init_request(set->driver_data,
  1317. hctx->fq->flush_rq, hctx_idx,
  1318. flush_start_tag + hctx_idx, node))
  1319. goto free_fq;
  1320. return 0;
  1321. free_fq:
  1322. kfree(hctx->fq);
  1323. exit_hctx:
  1324. if (set->ops->exit_hctx)
  1325. set->ops->exit_hctx(hctx, hctx_idx);
  1326. free_bitmap:
  1327. blk_mq_free_bitmap(&hctx->ctx_map);
  1328. free_ctxs:
  1329. kfree(hctx->ctxs);
  1330. unregister_cpu_notifier:
  1331. blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
  1332. return -1;
  1333. }
  1334. static int blk_mq_init_hw_queues(struct request_queue *q,
  1335. struct blk_mq_tag_set *set)
  1336. {
  1337. struct blk_mq_hw_ctx *hctx;
  1338. unsigned int i;
  1339. /*
  1340. * Initialize hardware queues
  1341. */
  1342. queue_for_each_hw_ctx(q, hctx, i) {
  1343. if (blk_mq_init_hctx(q, set, hctx, i))
  1344. break;
  1345. }
  1346. if (i == q->nr_hw_queues)
  1347. return 0;
  1348. /*
  1349. * Init failed
  1350. */
  1351. blk_mq_exit_hw_queues(q, set, i);
  1352. return 1;
  1353. }
  1354. static void blk_mq_init_cpu_queues(struct request_queue *q,
  1355. unsigned int nr_hw_queues)
  1356. {
  1357. unsigned int i;
  1358. for_each_possible_cpu(i) {
  1359. struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
  1360. struct blk_mq_hw_ctx *hctx;
  1361. memset(__ctx, 0, sizeof(*__ctx));
  1362. __ctx->cpu = i;
  1363. spin_lock_init(&__ctx->lock);
  1364. INIT_LIST_HEAD(&__ctx->rq_list);
  1365. __ctx->queue = q;
1366. /* If the cpu isn't online, the cpu is mapped to the first hctx */
  1367. if (!cpu_online(i))
  1368. continue;
  1369. hctx = q->mq_ops->map_queue(q, i);
  1370. cpumask_set_cpu(i, hctx->cpumask);
  1371. hctx->nr_ctx++;
  1372. /*
  1373. * Set local node, IFF we have more than one hw queue. If
  1374. * not, we remain on the home node of the device
  1375. */
  1376. if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
  1377. hctx->numa_node = cpu_to_node(i);
  1378. }
  1379. }
  1380. static void blk_mq_map_swqueue(struct request_queue *q)
  1381. {
  1382. unsigned int i;
  1383. struct blk_mq_hw_ctx *hctx;
  1384. struct blk_mq_ctx *ctx;
  1385. queue_for_each_hw_ctx(q, hctx, i) {
  1386. cpumask_clear(hctx->cpumask);
  1387. hctx->nr_ctx = 0;
  1388. }
  1389. /*
  1390. * Map software to hardware queues
  1391. */
  1392. queue_for_each_ctx(q, ctx, i) {
1393. /* If the cpu isn't online, the cpu is mapped to the first hctx */
  1394. if (!cpu_online(i))
  1395. continue;
  1396. hctx = q->mq_ops->map_queue(q, i);
  1397. cpumask_set_cpu(i, hctx->cpumask);
  1398. ctx->index_hw = hctx->nr_ctx;
  1399. hctx->ctxs[hctx->nr_ctx++] = ctx;
  1400. }
  1401. queue_for_each_hw_ctx(q, hctx, i) {
  1402. /*
  1403. * If no software queues are mapped to this hardware queue,
  1404. * disable it and free the request entries.
  1405. */
  1406. if (!hctx->nr_ctx) {
  1407. struct blk_mq_tag_set *set = q->tag_set;
  1408. if (set->tags[i]) {
  1409. blk_mq_free_rq_map(set, set->tags[i], i);
  1410. set->tags[i] = NULL;
  1411. hctx->tags = NULL;
  1412. }
  1413. continue;
  1414. }
  1415. /*
  1416. * Initialize batch roundrobin counts
  1417. */
  1418. hctx->next_cpu = cpumask_first(hctx->cpumask);
  1419. hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
  1420. }
  1421. }
  1422. static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
  1423. {
  1424. struct blk_mq_hw_ctx *hctx;
  1425. struct request_queue *q;
  1426. bool shared;
  1427. int i;
  1428. if (set->tag_list.next == set->tag_list.prev)
  1429. shared = false;
  1430. else
  1431. shared = true;
  1432. list_for_each_entry(q, &set->tag_list, tag_set_list) {
  1433. blk_mq_freeze_queue(q);
  1434. queue_for_each_hw_ctx(q, hctx, i) {
  1435. if (shared)
  1436. hctx->flags |= BLK_MQ_F_TAG_SHARED;
  1437. else
  1438. hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
  1439. }
  1440. blk_mq_unfreeze_queue(q);
  1441. }
  1442. }
static void blk_mq_del_queue_tag_set(struct request_queue *q)
{
        struct blk_mq_tag_set *set = q->tag_set;

        mutex_lock(&set->tag_list_lock);
        list_del_init(&q->tag_set_list);
        blk_mq_update_tag_set_depth(set);
        mutex_unlock(&set->tag_list_lock);
}

static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
                                     struct request_queue *q)
{
        q->tag_set = set;

        mutex_lock(&set->tag_list_lock);
        list_add_tail(&q->tag_set_list, &set->tag_list);
        blk_mq_update_tag_set_depth(set);
        mutex_unlock(&set->tag_list_lock);
}

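/*
 * Allocate and initialize a request queue for the given tag set:
 * per-CPU software contexts, hardware contexts, the CPU to hardware
 * queue map, timeout handling, the requeue machinery and the
 * make_request entry point. Returns the queue or an ERR_PTR() on
 * failure.
 */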
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
        struct blk_mq_hw_ctx **hctxs;
        struct blk_mq_ctx __percpu *ctx;
        struct request_queue *q;
        unsigned int *map;
        int i;

        ctx = alloc_percpu(struct blk_mq_ctx);
        if (!ctx)
                return ERR_PTR(-ENOMEM);

        /*
         * If a crashdump is active, then we are potentially in a very
         * memory constrained environment. Limit us to 1 queue and
         * 64 tags to prevent using too much memory.
         */
        if (is_kdump_kernel()) {
                set->nr_hw_queues = 1;
                set->queue_depth = min(64U, set->queue_depth);
        }

        hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
                             set->numa_node);
        if (!hctxs)
                goto err_percpu;

        map = blk_mq_make_queue_map(set);
        if (!map)
                goto err_map;

        for (i = 0; i < set->nr_hw_queues; i++) {
                int node = blk_mq_hw_queue_to_node(map, i);

                hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
                                        GFP_KERNEL, node);
                if (!hctxs[i])
                        goto err_hctxs;

                if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
                                             node))
                        goto err_hctxs;

                atomic_set(&hctxs[i]->nr_active, 0);
                hctxs[i]->numa_node = node;
                hctxs[i]->queue_num = i;
        }

        q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
        if (!q)
                goto err_hctxs;

        /*
         * Init percpu_ref in atomic mode so that it's faster to shutdown.
         * See blk_register_queue() for details.
         */
        if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
                            PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
                goto err_mq_usage;

        setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
        blk_queue_rq_timeout(q, 30000);

        q->nr_queues = nr_cpu_ids;
        q->nr_hw_queues = set->nr_hw_queues;
        q->mq_map = map;

        q->queue_ctx = ctx;
        q->queue_hw_ctx = hctxs;

        q->mq_ops = set->ops;
        q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;

        if (!(set->flags & BLK_MQ_F_SG_MERGE))
                q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;

        q->sg_reserved_size = INT_MAX;

        INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
        INIT_LIST_HEAD(&q->requeue_list);
        spin_lock_init(&q->requeue_lock);

        if (q->nr_hw_queues > 1)
                blk_queue_make_request(q, blk_mq_make_request);
        else
                blk_queue_make_request(q, blk_sq_make_request);

        if (set->timeout)
                blk_queue_rq_timeout(q, set->timeout);

        /*
         * Do this after blk_queue_make_request() overrides it...
         */
        q->nr_requests = set->queue_depth;

        if (set->ops->complete)
                blk_queue_softirq_done(q, set->ops->complete);

        blk_mq_init_cpu_queues(q, set->nr_hw_queues);

        if (blk_mq_init_hw_queues(q, set))
                goto err_mq_usage;

        mutex_lock(&all_q_mutex);
        list_add_tail(&q->all_q_node, &all_q_list);
        mutex_unlock(&all_q_mutex);

        blk_mq_add_queue_tag_set(set, q);

        blk_mq_map_swqueue(q);

        return q;

err_mq_usage:
        blk_cleanup_queue(q);
err_hctxs:
        kfree(map);
        for (i = 0; i < set->nr_hw_queues; i++) {
                if (!hctxs[i])
                        break;
                free_cpumask_var(hctxs[i]->cpumask);
                kfree(hctxs[i]);
        }
err_map:
        kfree(hctxs);
err_percpu:
        free_percpu(ctx);
        return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_queue);

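/*
 * Tear down a queue set up by blk_mq_init_queue(): detach it from its
 * tag set, exit and free the hardware contexts, and release the percpu
 * reference, the software contexts and the CPU map.
 */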
void blk_mq_free_queue(struct request_queue *q)
{
        struct blk_mq_tag_set *set = q->tag_set;

        blk_mq_del_queue_tag_set(q);

        blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
        blk_mq_free_hw_queues(q, set);

        percpu_ref_exit(&q->mq_usage_counter);

        free_percpu(q->queue_ctx);
        kfree(q->queue_hw_ctx);
        kfree(q->mq_map);

        q->queue_ctx = NULL;
        q->queue_hw_ctx = NULL;
        q->mq_map = NULL;

        mutex_lock(&all_q_mutex);
        list_del_init(&q->all_q_node);
        mutex_unlock(&all_q_mutex);
}

/* Basically redo blk_mq_init_queue with queue frozen */
static void blk_mq_queue_reinit(struct request_queue *q)
{
        WARN_ON_ONCE(!q->mq_freeze_depth);

        blk_mq_sysfs_unregister(q);

        blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);

        /*
         * Redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME:
         * maybe we should also change hctx->numa_node according to the
         * new topology; that would involve freeing and re-allocating
         * memory, so is it worth doing?
         */

        blk_mq_map_swqueue(q);

        blk_mq_sysfs_register(q);
}

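/*
 * CPU hotplug callback: on CPU_ONLINE/CPU_DEAD (and their _FROZEN
 * variants) freeze every blk-mq queue in the system, rebuild its CPU
 * mappings and unfreeze it again. Freezing is kicked off for all
 * queues before waiting, so the grace periods can overlap.
 */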
static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
                                      unsigned long action, void *hcpu)
{
        struct request_queue *q;

        /*
         * Before the new mappings are established, a hot-added CPU might
         * already have started handling requests. This doesn't break
         * anything, as we map offline CPUs to the first hardware queue.
         * We will re-init the queues below to get optimal settings.
         */
        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
            action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
                return NOTIFY_OK;

        mutex_lock(&all_q_mutex);

        /*
         * We need to freeze and reinit all existing queues. Freezing
         * involves a synchronous wait for an RCU grace period, and doing
         * it one by one may take a long time. Start freezing all queues
         * in one swoop and then wait for the completions so that freezing
         * can take place in parallel.
         */
        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_freeze_queue_start(q);
        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_freeze_queue_wait(q);

        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_queue_reinit(q);

        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_unfreeze_queue(q);

        mutex_unlock(&all_q_mutex);
        return NOTIFY_OK;
}

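/*
 * Allocate one request/tag map per hardware queue at the current
 * set->queue_depth, unwinding the maps already allocated on failure.
 */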
static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
        int i;

        for (i = 0; i < set->nr_hw_queues; i++) {
                set->tags[i] = blk_mq_init_rq_map(set, i);
                if (!set->tags[i])
                        goto out_unwind;
        }

        return 0;

out_unwind:
        while (--i >= 0)
                blk_mq_free_rq_map(set, set->tags[i], i);

        return -ENOMEM;
}

/*
 * Allocate the request maps associated with this tag_set. Note that this
 * may reduce the depth asked for, if memory is tight. set->queue_depth
 * will be updated to reflect the allocated depth.
 */
static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
        unsigned int depth;
        int err;

        depth = set->queue_depth;
        do {
                err = __blk_mq_alloc_rq_maps(set);
                if (!err)
                        break;

                set->queue_depth >>= 1;
                if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
                        err = -ENOMEM;
                        break;
                }
        } while (set->queue_depth);

        if (!set->queue_depth || err) {
                pr_err("blk-mq: failed to allocate request map\n");
                return -ENOMEM;
        }

        if (depth != set->queue_depth)
                pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
                        depth, set->queue_depth);

        return 0;
}

/*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions. May adjust the
 * requested depth down, if it is too large. In that case, the adjusted
 * value will be stored in set->queue_depth.
 */
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
        if (!set->nr_hw_queues)
                return -EINVAL;
        if (!set->queue_depth)
                return -EINVAL;
        if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
                return -EINVAL;

        if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
                return -EINVAL;

        if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
                pr_info("blk-mq: reduced tag depth to %u\n",
                        BLK_MQ_MAX_DEPTH);
                set->queue_depth = BLK_MQ_MAX_DEPTH;
        }

        set->tags = kmalloc_node(set->nr_hw_queues *
                                 sizeof(struct blk_mq_tags *),
                                 GFP_KERNEL, set->numa_node);
        if (!set->tags)
                return -ENOMEM;

        if (blk_mq_alloc_rq_maps(set))
                goto enomem;

        mutex_init(&set->tag_list_lock);
        INIT_LIST_HEAD(&set->tag_list);

        return 0;
enomem:
        kfree(set->tags);
        set->tags = NULL;
        return -ENOMEM;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);

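/*
 * Rough, illustrative sketch of the driver-side usage of the tag set
 * API above (the "my_*" names are hypothetical and error handling is
 * abbreviated; see existing blk-mq drivers such as null_blk or
 * virtio_blk for complete examples):
 *
 *        static struct blk_mq_ops my_mq_ops = {
 *                .queue_rq       = my_queue_rq,
 *                .map_queue      = blk_mq_map_queue,
 *        };
 *
 *        set->ops = &my_mq_ops;
 *        set->nr_hw_queues = 1;
 *        set->queue_depth = 64;
 *        set->numa_node = NUMA_NO_NODE;
 *        set->cmd_size = sizeof(struct my_cmd);
 *        set->flags = BLK_MQ_F_SHOULD_MERGE;
 *
 *        ret = blk_mq_alloc_tag_set(set);
 *        if (ret)
 *                return ret;
 *
 *        q = blk_mq_init_queue(set);
 *        if (IS_ERR(q)) {
 *                blk_mq_free_tag_set(set);
 *                return PTR_ERR(q);
 *        }
 *
 * Teardown is blk_cleanup_queue(q) followed by blk_mq_free_tag_set(set).
 */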
void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
        int i;

        for (i = 0; i < set->nr_hw_queues; i++) {
                if (set->tags[i])
                        blk_mq_free_rq_map(set, set->tags[i], i);
        }

        kfree(set->tags);
        set->tags = NULL;
}
EXPORT_SYMBOL(blk_mq_free_tag_set);

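/*
 * Resize a queue's tag depth to 'nr' requests (used e.g. by the sysfs
 * nr_requests attribute). The new depth may not exceed the depth the
 * tag set was allocated with.
 */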
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
        struct blk_mq_tag_set *set = q->tag_set;
        struct blk_mq_hw_ctx *hctx;
        int i, ret;

        if (!set || nr > set->queue_depth)
                return -EINVAL;

        ret = 0;
        queue_for_each_hw_ctx(q, hctx, i) {
                ret = blk_mq_tag_update_depth(hctx->tags, nr);
                if (ret)
                        break;
        }

        if (!ret)
                q->nr_requests = nr;

        return ret;
}

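/*
 * blk_mq_disable_hotplug()/blk_mq_enable_hotplug() bracket code that
 * must not race with the CPU hotplug notifier above; they simply take
 * and release all_q_mutex.
 */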
void blk_mq_disable_hotplug(void)
{
        mutex_lock(&all_q_mutex);
}

void blk_mq_enable_hotplug(void)
{
        mutex_unlock(&all_q_mutex);
}

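/*
 * Boot-time setup: initialize the blk-mq CPU notifier machinery and
 * register the hotplug notifier that re-initializes queue mappings.
 */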
static int __init blk_mq_init(void)
{
        blk_mq_cpu_init();

        hotcpu_notifier(blk_mq_queue_reinit_notify, 0);

        return 0;
}
subsys_initcall(blk_mq_init);