/*
 * kernel/power/tuxonice_cluster.c
 *
 * Copyright (C) 2006-2014 Nigel Cunningham (nigel at tuxonice net)
 *
 * This file is released under the GPLv2.
 *
 * This file contains routines for cluster hibernation support.
 *
 * Based on ip autoconfiguration code in net/ipv4/ipconfig.c.
 *
 * How does it work?
 *
 * There is no 'master' node that tells everyone else what to do. All nodes
 * send messages to the broadcast address/port, maintain a list of peers
 * and figure out when to progress to the next step in hibernating or resuming.
 * This makes us more fault tolerant when it comes to nodes coming and going
 * (which may be more of an issue if we're hibernating when power supplies
 * are unreliable).
 *
 * At boot time, we start a ktuxonice thread that handles communication with
 * other nodes. This node maintains a state machine that controls our progress
 * through hibernating and resuming, keeping us in step with other nodes. Nodes
 * are identified by their hw address.
 *
 * On startup, the node sends CLUSTER_PING on the configured interface's
 * broadcast address, port $toi_cluster_port (see below) and begins to listen
 * for other broadcast messages. CLUSTER_PING messages are repeated at
 * intervals of 5 minutes, with a random offset to spread traffic out.
 *
 * A hibernation cycle is initiated from any node via
 *
 *   echo > /sys/power/tuxonice/do_hibernate
 *
 * and (possibly) the hibernate script. At each step of the process, the node
 * completes its work, and waits for all other nodes to signal completion of
 * their work (or timeout) before progressing to the next step.
 *
 * Request/state   Action before reply   Possible reply    Next state
 *
 * HIBERNATE       capable, pre-script   HIBERNATE|ACK     NODE_PREP
 *                                       HIBERNATE|NACK    INIT_0
 *
 * PREP            prepare_image         PREP|ACK          IMAGE_WRITE
 *                                       PREP|NACK         INIT_0
 *                                       ABORT             RUNNING
 *
 * IO              write image           IO|ACK            power off
 *                                       ABORT             POST_RESUME
 *
 * (Boot time)     check for image       IMAGE|ACK         RESUME_PREP
 *                 (Note 1)
 *                                       IMAGE|NACK        (Note 2)
 *
 * PREP            prepare read image    PREP|ACK          IMAGE_READ
 *                                       PREP|NACK         (As NACK_IMAGE)
 *
 * IO              read image            IO|ACK            POST_RESUME
 *
 * POST_RESUME     thaw, post-script     RUNNING
 *
 * INIT_0          init 0
 *
 * Other messages:
 *
 * - PING: Request for all other live nodes to send a PONG. Used at startup to
 *   announce presence, when a node is suspected dead and periodically, in case
 *   segments of the network are [un]plugged.
 *
 * - PONG: Response to a PING.
 *
 * - ABORT: Request to cancel writing an image.
 *
 * - BYE: Notification that this node is shutting down.
 *
 * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that
 * nodes which are slower to start up can get state synchronised. If a node
 * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send
 * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it
 * must invalidate its image (if any) and boot normally.
 *
 * Note 2: May occur when one node lost power or powered off while others
 * hibernated. This node waits for others to complete resuming (ACK_READ)
 * before completing its boot, so that it appears as a failed node restarting.
 *
 * If any node has an image, then it also has a list of nodes that hibernated
 * in synchronisation with it. The node will wait for other nodes to appear
 * or timeout before beginning its restoration.
 *
 * If a node has no image, it needs to wait, in case other nodes which do have
 * an image are going to resume, but are taking longer to announce their
 * presence. For this reason, the user can specify a timeout value and a number
 * of nodes detected before we just continue. (We might want to assume in a
 * cluster of, say, 15 nodes, if 8 others have booted without finding an image,
 * the remaining nodes will too. This might help in situations where some nodes
 * are much slower to boot, or more subject to hardware failures or such like).
 */
#include <linux/suspend.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/if.h>
#include <linux/rtnetlink.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/in.h>
#include <linux/if_arp.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/netdevice.h>
#include <net/ip.h>

#include "tuxonice.h"
#include "tuxonice_modules.h"
#include "tuxonice_sysfs.h"
#include "tuxonice_alloc.h"
#include "tuxonice_io.h"

#if 1
#define PRINTK(a, b...) pr_debug(a, ##b)
#else
#define PRINTK(a, b...) do { } while (0)
#endif

static int loopback_mode;
static int num_local_nodes = 1;
#define MAX_LOCAL_NODES 8
#define SADDR (loopback_mode ? b->sid : h->saddr)

#define MYNAME "TuxOnIce Clustering"

enum cluster_message {
        MSG_ACK = 1,
        MSG_NACK = 2,
        MSG_PING = 4,
        MSG_ABORT = 8,
        MSG_BYE = 16,
        MSG_HIBERNATE = 32,
        MSG_IMAGE = 64,
        MSG_IO = 128,
        MSG_RUNNING = 256
};

/*
 * A message word is one state bit, optionally OR'd with MSG_ACK (1) or
 * MSG_NACK (2); hence the literal case values below.
 */
static char *str_message(int message)
{
        switch (message) {
        case 4:
                return "Ping";
        case 8:
                return "Abort";
        case 9:
                return "Abort acked";
        case 10:
                return "Abort nacked";
        case 16:
                return "Bye";
        case 17:
                return "Bye acked";
        case 18:
                return "Bye nacked";
        case 32:
                return "Hibernate request";
        case 33:
                return "Hibernate ack";
        case 34:
                return "Hibernate nack";
        case 64:
                return "Image exists?";
        case 65:
                return "Image does exist";
        case 66:
                return "No image here";
        case 128:
                return "I/O";
        case 129:
                return "I/O okay";
        case 130:
                return "I/O failed";
        case 256:
                return "Running";
        default:
                pr_err("Unrecognised message %d.\n", message);
                return "Unrecognised message (see dmesg)";
        }
}

#define MSG_ACK_MASK (MSG_ACK | MSG_NACK)
#define MSG_STATE_MASK (~MSG_ACK_MASK)
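
/*
 * Example (a sketch, not part of the protocol code): decomposing a received
 * message word. A packet carrying MSG_HIBERNATE | MSG_ACK (32 + 1 == 33,
 * "Hibernate ack" in str_message()) splits as
 *
 *      int state = msg & MSG_STATE_MASK;    yields MSG_HIBERNATE (32)
 *      int ack   = msg & MSG_ACK_MASK;      yields MSG_ACK (1)
 *
 * toi_recv() below performs exactly this split to separate "which step is
 * the peer at" from "how did the peer answer".
 */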

struct node_info {
        struct list_head member_list;
        wait_queue_head_t member_events;
        spinlock_t member_list_lock;
        spinlock_t receive_lock;
        int peer_count, ignored_peer_count;
        struct toi_sysfs_data sysfs_data;
        enum cluster_message current_message;
};

struct node_info node_array[MAX_LOCAL_NODES];

struct cluster_member {
        __be32 addr;
        enum cluster_message message;
        struct list_head list;
        int ignore;
};

#define toi_cluster_port_send 3501
#define toi_cluster_port_recv 3502

static struct net_device *net_dev;
static struct toi_module_ops toi_cluster_ops;

static int toi_recv(struct sk_buff *skb, struct net_device *dev,
                struct packet_type *pt, struct net_device *orig_dev);

static struct packet_type toi_cluster_packet_type = {
        .type = htons(ETH_P_IP),
        .func = toi_recv,
};

struct toi_pkt {                /* BOOTP packet format */
        struct iphdr iph;       /* IP header */
        struct udphdr udph;     /* UDP header */
        u8 htype;               /* HW address type */
        u8 hlen;                /* HW address length */
        __be32 xid;             /* Transaction ID */
        __be16 secs;            /* Seconds since we started */
        __be16 flags;           /* Just what it says */
        u8 hw_addr[16];         /* Sender's HW address */
        u16 message;            /* Message */
        unsigned long sid;      /* Source ID for loopback testing */
};
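
/*
 * On the wire, a cluster message is simply this struct broadcast as a raw
 * IPv4/UDP frame: source port toi_cluster_port_send (3501), destination
 * port toi_cluster_port_recv (3502), destination address 255.255.255.255,
 * UDP checksum unset (as the BOOTP RFC permits) and fragmentation
 * forbidden. toi_recv() drops anything that does not match this shape
 * before it looks at the message word.
 */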

static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE;

static int added_pack;

static int others_have_image;

/* Key used to allow multiple clusters on the same lan */
static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY;
/* Sized to match the 256-byte bounds declared in sysfs_params below */
static char pre_hibernate_script[256] = CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE;
static char post_hibernate_script[256] = CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE;

/* List of cluster members */
static unsigned long continue_delay = 5 * HZ;
static unsigned long cluster_message_timeout = 3 * HZ;

/* === Membership list === */

static void print_member_info(int index)
{
        struct cluster_member *this;

        pr_warn("==> Dumping node %d.\n", index);

        list_for_each_entry(this, &node_array[index].member_list, list)
                pr_warn("%d.%d.%d.%d last message %s. %s\n",
                        NIPQUAD(this->addr),
                        str_message(this->message),
                        this->ignore ? "(Ignored)" : "");

        pr_warn("== Done ==\n");
}

static struct cluster_member *__find_member(int index, __be32 addr)
{
        struct cluster_member *this;

        list_for_each_entry(this, &node_array[index].member_list, list) {
                if (this->addr != addr)
                        continue;

                return this;
        }

        return NULL;
}

static void set_ignore(int index, __be32 addr, struct cluster_member *this)
{
        if (this->ignore) {
                PRINTK("Node %d already ignoring %d.%d.%d.%d.\n",
                        index, NIPQUAD(addr));
                return;
        }

        PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n",
                index, NIPQUAD(addr));
        this->ignore = 1;
        node_array[index].ignored_peer_count++;
}

static int __add_update_member(int index, __be32 addr, int message)
{
        struct cluster_member *this;

        this = __find_member(index, addr);
        if (this) {
                if (this->message != message) {
                        this->message = message;
                        if ((message & MSG_NACK) &&
                            (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
                                set_ignore(index, addr, this);
                        PRINTK("Node %d sees node %d.%d.%d.%d now sending %s.\n",
                                index, NIPQUAD(addr), str_message(message));
                        wake_up(&node_array[index].member_events);
                }
                return 0;
        }

        /* Called under member_list_lock, possibly from softirq: no sleeping */
        this = (struct cluster_member *)toi_kzalloc(36,
                        sizeof(struct cluster_member), GFP_ATOMIC);
        if (!this)
                return -1;

        this->addr = addr;
        this->message = message;
        this->ignore = 0;
        INIT_LIST_HEAD(&this->list);
        node_array[index].peer_count++;

        PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index,
                NIPQUAD(addr), str_message(message));

        if ((message & MSG_NACK) &&
            (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
                set_ignore(index, addr, this);

        list_add_tail(&this->list, &node_array[index].member_list);
        return 1;
}
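
/*
 * A peer that NACKs a HIBERNATE, IMAGE or IO request is opting out of the
 * current cycle, so it is flagged as ignored: peers_in_message() and
 * peers_not_in_message() skip it when deciding whether the cluster can
 * progress, until reset_ignored() clears the flags for the next cycle.
 */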

static int add_update_member(int index, __be32 addr, int message)
{
        int result;
        unsigned long flags;

        spin_lock_irqsave(&node_array[index].member_list_lock, flags);
        result = __add_update_member(index, addr, message);
        spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);

        print_member_info(index);

        wake_up(&node_array[index].member_events);

        return result;
}

static void del_member(int index, __be32 addr)
{
        struct cluster_member *this;
        unsigned long flags;

        spin_lock_irqsave(&node_array[index].member_list_lock, flags);
        this = __find_member(index, addr);

        if (this) {
                list_del_init(&this->list);
                toi_kfree(36, this, sizeof(*this));
                node_array[index].peer_count--;
        }

        spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
}

/* === Message transmission === */

static void toi_send_if(int message, unsigned long my_id);

/*
 * Process received TOI packet.
 */
static int toi_recv(struct sk_buff *skb, struct net_device *dev,
                struct packet_type *pt, struct net_device *orig_dev)
{
        struct toi_pkt *b;
        struct iphdr *h;
        int len, result, index;
        unsigned long addr, message, ack;

        /* Perform verifications before taking the lock. */
        if (skb->pkt_type == PACKET_OTHERHOST)
                goto drop;

        if (dev != net_dev)
                goto drop;

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (!skb)
                return NET_RX_DROP;

        if (!pskb_may_pull(skb, sizeof(struct iphdr) + sizeof(struct udphdr)))
                goto drop;

        b = (struct toi_pkt *)skb_network_header(skb);
        h = &b->iph;

        if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
                goto drop;

        /* Fragments are not supported */
        if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
                if (net_ratelimit())
                        pr_err("TuxOnIce: Ignoring fragmented cluster message.\n");
                goto drop;
        }

        if (skb->len < ntohs(h->tot_len))
                goto drop;

        if (ip_fast_csum((char *)h, h->ihl))
                goto drop;

        if (b->udph.source != htons(toi_cluster_port_send) ||
            b->udph.dest != htons(toi_cluster_port_recv))
                goto drop;

        if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
                goto drop;

        len = ntohs(b->udph.len) - sizeof(struct udphdr);

        /* Ok the front looks good, make sure we can get at the rest. */
        if (!pskb_may_pull(skb, skb->len))
                goto drop;

        b = (struct toi_pkt *)skb_network_header(skb);
        h = &b->iph;

        addr = SADDR;
        PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n",
                str_message(b->message), NIPQUAD(addr));

        message = b->message & MSG_STATE_MASK;
        ack = b->message & MSG_ACK_MASK;

        for (index = 0; index < num_local_nodes; index++) {
                int new_message = node_array[index].current_message,
                    old_message = new_message;

                if (index == SADDR || !old_message) {
                        PRINTK("Ignoring node %d (offline or self).\n", index);
                        continue;
                }

                /* One message at a time, please. */
                spin_lock(&node_array[index].receive_lock);

                result = add_update_member(index, SADDR, b->message);
                if (result == -1) {
                        pr_warn("Failed to add new cluster member " NIPQUAD_FMT ".\n",
                                NIPQUAD(addr));
                        goto drop_unlock;
                }

                switch (b->message & MSG_STATE_MASK) {
                case MSG_PING:
                        break;
                case MSG_ABORT:
                        break;
                case MSG_BYE:
                        break;
                case MSG_HIBERNATE:
                        /* Can I hibernate? */
                        new_message = MSG_HIBERNATE |
                                ((index & 1) ? MSG_NACK : MSG_ACK);
                        break;
                case MSG_IMAGE:
                        /* Can I resume? */
                        new_message = MSG_IMAGE |
                                ((index & 1) ? MSG_NACK : MSG_ACK);
                        if (new_message != old_message)
                                pr_err("Setting whether I can resume to %d.\n",
                                        new_message);
                        break;
                case MSG_IO:
                        new_message = MSG_IO | MSG_ACK;
                        break;
                case MSG_RUNNING:
                        break;
                default:
                        if (net_ratelimit())
                                pr_err("Unrecognised TuxOnIce cluster message %d from " NIPQUAD_FMT ".\n",
                                        b->message, NIPQUAD(addr));
                }

                if (old_message != new_message) {
                        node_array[index].current_message = new_message;
                        pr_warn(">>> Sending new message for node %d.\n",
                                index);
                        toi_send_if(new_message, index);
                } else if (!ack) {
                        pr_warn(">>> Resending message for node %d.\n", index);
                        toi_send_if(new_message, index);
                }
drop_unlock:
                spin_unlock(&node_array[index].receive_lock);
        }

drop:
        /* Throw the packet out. */
        kfree_skb(skb);
        return 0;
}

/*
 * Send cluster message to single interface.
 */
static void toi_send_if(int message, unsigned long my_id)
{
        struct sk_buff *skb;
        struct toi_pkt *b;
        int hh_len = LL_RESERVED_SPACE(net_dev);
        struct iphdr *h;

        /* Allocate packet; atomic because we may be under receive_lock */
        skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_ATOMIC);
        if (!skb)
                return;

        skb_reserve(skb, hh_len);
        b = (struct toi_pkt *)skb_put(skb, sizeof(struct toi_pkt));
        memset(b, 0, sizeof(struct toi_pkt));

        /* Construct IP header */
        skb_reset_network_header(skb);
        h = ip_hdr(skb);
        h->version = 4;
        h->ihl = 5;
        h->tot_len = htons(sizeof(struct toi_pkt));
        h->frag_off = htons(IP_DF);
        h->ttl = 64;
        h->protocol = IPPROTO_UDP;
        h->daddr = htonl(INADDR_BROADCAST);
        h->check = ip_fast_csum((unsigned char *)h, h->ihl);

        /* Construct UDP header */
        b->udph.source = htons(toi_cluster_port_send);
        b->udph.dest = htons(toi_cluster_port_recv);
        b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr));
        /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */

        /* Construct message */
        b->message = message;
        b->sid = my_id;
        b->htype = net_dev->type; /* can cause undefined behavior */
        b->hlen = net_dev->addr_len;
        memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len);
        b->secs = htons(3); /* 3 seconds */

        /* Chain packet down the line... */
        skb->dev = net_dev;
        skb->protocol = htons(ETH_P_IP);

        if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol),
                        net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) ||
            dev_queue_xmit(skb) < 0)
                pr_warn("E");
}

/* ========================================= */

/* kTOICluster */

static atomic_t num_cluster_threads;
static DECLARE_WAIT_QUEUE_HEAD(clusterd_events);

static int kTOICluster(void *data)
{
        unsigned long my_id;

        my_id = atomic_add_return(1, &num_cluster_threads) - 1;
        node_array[my_id].current_message = (unsigned long)data;

        PRINTK("kTOICluster daemon %lu starting.\n", my_id);

        current->flags |= PF_NOFREEZE;

        while (node_array[my_id].current_message) {
                toi_send_if(node_array[my_id].current_message, my_id);
                sleep_on_timeout(&clusterd_events, cluster_message_timeout);
                PRINTK("Link state %lu is %d.\n", my_id,
                        node_array[my_id].current_message);
        }

        toi_send_if(MSG_BYE, my_id);
        atomic_dec(&num_cluster_threads);
        wake_up(&clusterd_events);

        PRINTK("kTOICluster daemon %lu exiting.\n", my_id);
        __set_current_state(TASK_RUNNING);
        return 0;
}

static void kill_clusterd(void)
{
        int i;

        for (i = 0; i < num_local_nodes; i++) {
                if (node_array[i].current_message) {
                        PRINTK("Seeking to kill clusterd %d.\n", i);
                        node_array[i].current_message = 0;
                }
        }

        wait_event(clusterd_events, !atomic_read(&num_cluster_threads));
        PRINTK("All cluster daemons have exited.\n");
}

static int peers_not_in_message(int index, int message, int precise)
{
        struct cluster_member *this;
        unsigned long flags;
        int result = 0;

        spin_lock_irqsave(&node_array[index].member_list_lock, flags);
        list_for_each_entry(this, &node_array[index].member_list, list) {
                if (this->ignore)
                        continue;

                PRINTK("Peer %d.%d.%d.%d sending %s. Seeking %s.\n",
                        NIPQUAD(this->addr), str_message(this->message),
                        str_message(message));
                if ((precise ? this->message :
                     this->message & MSG_STATE_MASK) != message)
                        result++;
        }
        spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
        PRINTK("%d peers not in sought message.\n", result);
        return result;
}

static void reset_ignored(int index)
{
        struct cluster_member *this;
        unsigned long flags;

        spin_lock_irqsave(&node_array[index].member_list_lock, flags);
        list_for_each_entry(this, &node_array[index].member_list, list)
                this->ignore = 0;
        node_array[index].ignored_peer_count = 0;
        spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
}

static int peers_in_message(int index, int message, int precise)
{
        return node_array[index].peer_count -
                node_array[index].ignored_peer_count -
                peers_not_in_message(index, message, precise);
}

static int time_to_continue(int index, unsigned long start, int message)
{
        int first = peers_not_in_message(index, message, 0);
        int second = peers_in_message(index, message, 1);

        PRINTK("First part returns %d, second returns %d.\n", first, second);

        if (!first && !second) {
                PRINTK("All peers answered message %d.\n", message);
                return 1;
        }

        if (time_after(jiffies, start + continue_delay)) {
                PRINTK("Timeout reached.\n");
                return 1;
        }

        PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies,
                start + continue_delay);
        return 0;
}
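
/*
 * Callers pair time_to_continue() with wait_event(): broadcast a request,
 * then sleep until either every non-ignored peer is in the requested state
 * (first == 0) and has qualified it with an ack or nack rather than still
 * echoing the bare request (second == 0), or continue_delay has elapsed.
 * A minimal sketch of the pattern used throughout this file:
 *
 *      toi_send_if(MSG_HIBERNATE, 0);
 *      start = jiffies;
 *      wait_event(node_array[0].member_events,
 *                 time_to_continue(0, start, MSG_HIBERNATE));
 */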

void toi_initiate_cluster_hibernate(void)
{
        int result;
        unsigned long start;

        result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
        if (result)
                return;

        toi_send_if(MSG_HIBERNATE, 0);

        start = jiffies;
        wait_event(node_array[0].member_events,
                   time_to_continue(0, start, MSG_HIBERNATE));

        if (test_action_state(TOI_FREEZER_TEST)) {
                toi_send_if(MSG_ABORT, 0);

                start = jiffies;
                wait_event(node_array[0].member_events,
                           time_to_continue(0, start, MSG_RUNNING));

                do_toi_step(STEP_QUIET_CLEANUP);
                return;
        }

        toi_send_if(MSG_IO, 0);

        result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
        if (result)
                return;

        /* This code runs at resume time too! */
        if (toi_in_hibernate)
                result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
}
EXPORT_SYMBOL_GPL(toi_initiate_cluster_hibernate);

/* toi_cluster_print_debug_stats
 *
 * Description: Print information to be recorded for debugging purposes into a
 *              buffer.
 * Arguments:   buffer: Pointer to a buffer into which the debug info will be
 *              printed.
 *              size: Size of the buffer.
 * Returns:     Number of characters written to the buffer.
 */
static int toi_cluster_print_debug_stats(char *buffer, int size)
{
        int len;

        if (strlen(toi_cluster_iface))
                len = scnprintf(buffer, size,
                        "- Cluster interface is '%s'.\n", toi_cluster_iface);
        else
                len = scnprintf(buffer, size,
                        "- Cluster support is disabled.\n");
        return len;
}

/* cluster_memory_needed
 *
 * Description: Tell the caller how much memory we need to operate during
 *              hibernate/resume.
 * Returns:     Unsigned long. Maximum number of bytes of memory required for
 *              operation.
 */
static int toi_cluster_memory_needed(void)
{
        return 0;
}

static int toi_cluster_storage_needed(void)
{
        return 1 + strlen(toi_cluster_iface);
}

/* toi_cluster_save_config_info
 *
 * Description: Save information needed when reloading the image at resume
 *              time.
 * Arguments:   buffer: Pointer to a buffer of size PAGE_SIZE.
 * Returns:     Number of bytes used for saving our data.
 */
static int toi_cluster_save_config_info(char *buffer)
{
        strcpy(buffer, toi_cluster_iface);
        return strlen(toi_cluster_iface) + 1;
}

/* toi_cluster_load_config_info
 *
 * Description: Reload information needed for declustering the image at
 *              resume time.
 * Arguments:   buffer: Pointer to the start of the data.
 *              size: Number of bytes that were saved.
 */
static void toi_cluster_load_config_info(char *buffer, int size)
{
        strncpy(toi_cluster_iface, buffer, size);
}

static void cluster_startup(void)
{
        int have_image = do_check_can_resume(), i;
        unsigned long start = jiffies, initial_message;
        struct task_struct *p;

        initial_message = MSG_IMAGE;

        /* Note: have_image is forced on here (debugging aid). */
        have_image = 1;

        for (i = 0; i < num_local_nodes; i++) {
                PRINTK("Starting ktoiclusterd %d.\n", i);
                p = kthread_create(kTOICluster, (void *)initial_message,
                                "ktoiclusterd/%d", i);
                if (IS_ERR(p)) {
                        pr_err("Failed to start ktoiclusterd.\n");
                        return;
                }

                wake_up_process(p);
        }

        /* Wait for delay or someone else sending first message */
        wait_event(node_array[0].member_events,
                   time_to_continue(0, start, MSG_IMAGE));

        others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1);

        pr_warn("Continuing. I %shave an image. Peers with image: %d.\n",
                have_image ? "" : "don't ", others_have_image);

        if (have_image) {
                int result;

                /* Start to resume */
                pr_warn(" === Starting to resume ===\n");
                node_array[0].current_message = MSG_IO;
                toi_send_if(MSG_IO, 0);

                /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
                result = 0;

                if (!result) {
                        /*
                         * Atomic restore - we'll come back in the hibernation
                         * path.
                         */

                        /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
                        result = 0;

                        /* do_toi_step(STEP_QUIET_CLEANUP); */
                }

                node_array[0].current_message |= MSG_NACK;

                /* For debugging - disable for real life? */
                wait_event(node_array[0].member_events,
                           time_to_continue(0, start, MSG_IO));
        }

        if (others_have_image) {
                /* Wait for them to resume */
                pr_warn("Waiting for other nodes to resume.\n");
                start = jiffies;
                wait_event(node_array[0].member_events,
                           time_to_continue(0, start, MSG_RUNNING));
                if (peers_not_in_message(0, MSG_RUNNING, 0))
                        pr_warn("Timed out while waiting for other nodes to resume.\n");
        }

        /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
         * as appropriate.
         *
         * If we don't have an image:
         * - Wait until someone else says they have one, or conditions are met
         *   for continuing to boot (n machines or t seconds).
         * - If anyone has an image, wait for them to resume before continuing
         *   to boot.
         *
         * If we have an image:
         * - Wait until conditions are met before continuing to resume (n
         *   machines or t seconds). Send RESUME_PREP and freeze processes.
         *   NACK_PREP if freezing fails (shouldn't) and follow logic for
         *   us having no image above. On success, wait for [N]ACK_PREP from
         *   other machines. Read image (including atomic restore) until done.
         *   Wait for ACK_READ from others (should never fail). Thaw processes
         *   and do post-resume. (The section after the atomic restore is done
         *   via the code for hibernating).
         */

        node_array[0].current_message = MSG_RUNNING;
}

/* toi_cluster_open_iface
 *
 * Description: Prepare to use an interface.
 */
static int toi_cluster_open_iface(void)
{
        struct net_device *dev;

        rtnl_lock();

        for_each_netdev(&init_net, dev) {
                if (/* dev == &init_net.loopback_dev || */
                    strcmp(dev->name, toi_cluster_iface))
                        continue;

                net_dev = dev;
                break;
        }

        rtnl_unlock();

        if (!net_dev) {
                pr_err(MYNAME ": Device %s not found.\n", toi_cluster_iface);
                return -ENODEV;
        }

        dev_add_pack(&toi_cluster_packet_type);
        added_pack = 1;

        loopback_mode = (net_dev == init_net.loopback_dev);
        num_local_nodes = loopback_mode ? 8 : 1;

        PRINTK("Loopback mode is %s. Number of local nodes is %d.\n",
                loopback_mode ? "on" : "off", num_local_nodes);

        cluster_startup();
        return 0;
}
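
/*
 * Loopback testing: pointing the interface at the loopback device puts the
 * module in loopback_mode, which simulates a cluster of eight nodes
 * (MAX_LOCAL_NODES) on one machine. In that mode the sid field of
 * struct toi_pkt, rather than the source IP address, identifies the
 * sending node (see the SADDR macro above).
 */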

/* toi_cluster_close_iface
 *
 * Description: Stop using an interface.
 */
static int toi_cluster_close_iface(void)
{
        kill_clusterd();
        if (added_pack) {
                dev_remove_pack(&toi_cluster_packet_type);
                added_pack = 0;
        }
        return 0;
}

static void write_side_effect(void)
{
        if (toi_cluster_ops.enabled) {
                toi_cluster_open_iface();
                set_toi_state(TOI_CLUSTER_MODE);
        } else {
                toi_cluster_close_iface();
                clear_toi_state(TOI_CLUSTER_MODE);
        }
}

static void node_write_side_effect(void)
{
}

/*
 * data for our sysfs entries.
 */
static struct toi_sysfs_data sysfs_params[] = {
        SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0,
                     NULL),
        SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0,
                  write_side_effect),
        SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL),
        SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script,
                     256, 0, NULL),
        SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script,
                     256, 0, NULL),
        SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ,
                 0)
};

/*
 * Ops structure.
 */
static struct toi_module_ops toi_cluster_ops = {
        .type                   = FILTER_MODULE,
        .name                   = "Cluster",
        .directory              = "cluster",
        .module                 = THIS_MODULE,
        .memory_needed          = toi_cluster_memory_needed,
        .print_debug_info       = toi_cluster_print_debug_stats,
        .save_config_info       = toi_cluster_save_config_info,
        .load_config_info       = toi_cluster_load_config_info,
        .storage_needed         = toi_cluster_storage_needed,

        .sysfs_data             = sysfs_params,
        .num_sysfs_entries      = ARRAY_SIZE(sysfs_params),
};
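
/*
 * Once the module is registered, the entries above are expected to appear
 * under the TuxOnIce sysfs tree (presumably /sys/power/tuxonice/cluster/,
 * alongside the do_hibernate entry mentioned at the top of this file).
 * Assuming that layout, clustering could be brought up from userspace with:
 *
 *      echo eth0 > /sys/power/tuxonice/cluster/interface
 *      echo 1    > /sys/power/tuxonice/cluster/enabled
 *
 * Writing "enabled" triggers write_side_effect(), which opens the interface
 * and sets TOI_CLUSTER_MODE.
 */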

/* ---- Registration ---- */

#ifdef MODULE
static int __init toi_cluster_init(void)
#else
static int toi_cluster_init(void)
#endif
{
        int temp = toi_register_module(&toi_cluster_ops), i;
        struct kobject *kobj = toi_cluster_ops.dir_kobj;

        for (i = 0; i < MAX_LOCAL_NODES; i++) {
                node_array[i].current_message = 0;
                INIT_LIST_HEAD(&node_array[i].member_list);
                init_waitqueue_head(&node_array[i].member_events);
                spin_lock_init(&node_array[i].member_list_lock);
                spin_lock_init(&node_array[i].receive_lock);

                /* Set up sysfs entry */
                node_array[i].sysfs_data.attr.name = toi_kzalloc(8,
                                sizeof("node_NN"), /* room for "node_%d" */
                                GFP_KERNEL);
                if (!node_array[i].sysfs_data.attr.name)
                        continue;
                sprintf((char *)node_array[i].sysfs_data.attr.name,
                        "node_%d", i);
                node_array[i].sysfs_data.attr.mode = SYSFS_RW;
                node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER;
                node_array[i].sysfs_data.flags = 0;
                node_array[i].sysfs_data.data.integer.variable =
                        (int *)&node_array[i].current_message;
                node_array[i].sysfs_data.data.integer.minimum = 0;
                node_array[i].sysfs_data.data.integer.maximum = INT_MAX;
                node_array[i].sysfs_data.write_side_effect =
                        node_write_side_effect;
                toi_register_sysfs_file(kobj, &node_array[i].sysfs_data);
        }

        toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0);

        if (toi_cluster_ops.enabled)
                toi_cluster_open_iface();

        return temp;
}

#ifdef MODULE
static void __exit toi_cluster_exit(void)
#else
static void toi_cluster_exit(void)
#endif
{
        int i;

        toi_cluster_close_iface();

        for (i = 0; i < MAX_LOCAL_NODES; i++)
                toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj,
                                &node_array[i].sysfs_data);
        toi_unregister_module(&toi_cluster_ops);
}

static int __init toi_cluster_iface_setup(char *iface)
{
        toi_cluster_ops.enabled = (*iface && strcmp(iface, "off"));

        if (toi_cluster_ops.enabled) {
                strncpy(toi_cluster_iface, iface, IFNAMSIZ - 1);
                toi_cluster_iface[IFNAMSIZ - 1] = '\0';
        }

        return 1;
}

__setup("toi_cluster=", toi_cluster_iface_setup);
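
/*
 * The interface can also be chosen on the kernel command line, e.g.
 *
 *      toi_cluster=eth0
 *
 * while "toi_cluster=off" (or an empty value) leaves clustering disabled.
 */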

#ifdef MODULE
MODULE_LICENSE("GPL");
module_init(toi_cluster_init);
module_exit(toi_cluster_exit);
MODULE_AUTHOR("Nigel Cunningham");
MODULE_DESCRIPTION("Cluster Support for TuxOnIce");
#endif