| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025 |
- /*
- * kernel/power/tuxonice_cluster.c
- *
- * Copyright (C) 2006-2014 Nigel Cunningham (nigel at tuxonice net)
- *
- * This file is released under the GPLv2.
- *
- * This file contains routines for cluster hibernation support.
- *
- * Based on ip autoconfiguration code in net/ipv4/ipconfig.c.
- *
- * How does it work?
- *
- * There is no 'master' node that tells everyone else what to do. All nodes
- * send messages to the broadcast address/port, maintain a list of peers
- * and figure out when to progress to the next step in hibernating or resuming.
- * This makes us more fault tolerant when it comes to nodes coming and going
- * (which may be more of an issue if we're hibernating when power supplies
- * are being unreliable).
- *
- * At boot time, we start a ktuxonice thread that handles communication with
- * other nodes. This node maintains a state machine that controls our progress
- * through hibernating and resuming, keeping us in step with other nodes. Nodes
- * are identified by their hw address.
- *
- * On startup, the node sends CLUSTER_PING on the configured interface's
- * broadcast address, port $toi_cluster_port (see below) and begins to listen
- * for other broadcast messages. CLUSTER_PING messages are repeated at
- * intervals of 5 minutes, with a random offset to spread traffic out.
- *
- * A hibernation cycle is initiated from any node via
- *
- * echo > /sys/power/tuxonice/do_hibernate
- *
- * and (possibly) the hibernate script. At each step of the process, the node
- * completes its work, and waits for all other nodes to signal completion of
- * their work (or timeout) before progressing to the next step.
- *
- * Request/state  Action before reply   Possible reply   Next state
- * HIBERNATE      capable, pre-script   HIBERNATE|ACK    NODE_PREP
- *                                      HIBERNATE|NACK   INIT_0
- *
- * PREP           prepare_image         PREP|ACK         IMAGE_WRITE
- *                                      PREP|NACK        INIT_0
- *                                      ABORT            RUNNING
- *
- * IO             write image           IO|ACK           power off
- *                                      ABORT            POST_RESUME
- *
- * (Boot time)    check for image       IMAGE|ACK        RESUME_PREP
- *                                      (Note 1)
- *                                      IMAGE|NACK       (Note 2)
- *
- * PREP           prepare read image    PREP|ACK         IMAGE_READ
- *                                      PREP|NACK        (As NACK_IMAGE)
- *
- * IO             read image            IO|ACK           POST_RESUME
- *
- * POST_RESUME    thaw, post-script                      RUNNING
- *
- * INIT_0         init 0
- *
- * Other messages:
- *
- * - PING: Request for all other live nodes to send a PONG. Used at startup to
- * announce presence, when a node is suspected dead and periodically, in case
- * segments of the network are [un]plugged.
- *
- * - PONG: Response to a PING.
- *
- * - ABORT: Request to cancel writing an image.
- *
- * - BYE: Notification that this node is shutting down.
- *
- * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that
- * nodes which are slower to start up can get state synchronised. If a node
- * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send
- * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it
- * must invalidate its image (if any) and boot normally.
- *
- * Note 2: May occur when one node lost power or powered off while others
- * hibernated. This node waits for others to complete resuming (ACK_READ)
- * before completing its boot, so that it appears as a failed node restarting.
- *
- * If any node has an image, then it also has a list of nodes that hibernated
- * in synchronisation with it. The node will wait for other nodes to appear
- * or timeout before beginning its restoration.
- *
- * If a node has no image, it needs to wait, in case other nodes which do have
- * an image are going to resume, but are taking longer to announce their
- * presence. For this reason, the user can specify a timeout value and a number
- * of nodes detected before we just continue. (We might want to assume in a
- * cluster of, say, 15 nodes, if 8 others have booted without finding an image,
- * the remaining nodes will too. This might help in situations where some nodes
- * are much slower to boot, or more subject to hardware failures or such like).
- */
- #include <linux/suspend.h>
- #include <linux/module.h>
- #include <linux/moduleparam.h>
- #include <linux/if.h>
- #include <linux/rtnetlink.h>
- #include <linux/ip.h>
- #include <linux/udp.h>
- #include <linux/in.h>
- #include <linux/if_arp.h>
- #include <linux/kthread.h>
- #include <linux/wait.h>
- #include <linux/netdevice.h>
- #include <net/ip.h>
- #include "tuxonice.h"
- #include "tuxonice_modules.h"
- #include "tuxonice_sysfs.h"
- #include "tuxonice_alloc.h"
- #include "tuxonice_io.h"
/*
 * Debug output switch: change the condition to 0 to compile every PRINTK()
 * call away to nothing.
 */
#if 1
#define PRINTK(a, b...) pr_debug(a, ##b)
#else
#define PRINTK(a, b...) do { } while (0)
#endif

/* Non-zero when the cluster interface is the loopback device. */
static int loopback_mode;

/* Number of node_array entries in use (8 in loopback mode, otherwise 1). */
static int num_local_nodes = 1;

/* Size of node_array. */
#define MAX_LOCAL_NODES 8

/*
 * Identity of the sender of a received packet. Expands in place, so the
 * caller needs 'b' (struct toi_pkt *) and 'h' (struct iphdr *) in scope.
 */
#define SADDR (loopback_mode ? b->sid : h->saddr)

/* Prefix used in this module's log messages. */
#define MYNAME "TuxOnIce Clustering"
/*
 * Message bits carried in struct toi_pkt. MSG_ACK and MSG_NACK are reply
 * flags that are OR-ed onto one of the request/state bits that follow.
 */
enum cluster_message {
	MSG_ACK		= 1 << 0,
	MSG_NACK	= 1 << 1,
	MSG_PING	= 1 << 2,
	MSG_ABORT	= 1 << 3,
	MSG_BYE		= 1 << 4,
	MSG_HIBERNATE	= 1 << 5,
	MSG_IMAGE	= 1 << 6,
	MSG_IO		= 1 << 7,
	MSG_RUNNING	= 1 << 8
};
- static char *str_message(int message)
- {
- switch (message) {
- case 4:
- return "Ping";
- case 8:
- return "Abort";
- case 9:
- return "Abort acked";
- case 10:
- return "Abort nacked";
- case 16:
- return "Bye";
- case 17:
- return "Bye acked";
- case 18:
- return "Bye nacked";
- case 32:
- return "Hibernate request";
- case 33:
- return "Hibernate ack";
- case 34:
- return "Hibernate nack";
- case 64:
- return "Image exists?";
- case 65:
- return "Image does exist";
- case 66:
- return "No image here";
- case 128:
- return "I/O";
- case 129:
- return "I/O okay";
- case 130:
- return "I/O failed";
- case 256:
- return "Running";
- default:
- pr_err("Unrecognised message %d.\n", message);
- return "Unrecognised message (see dmesg)";
- }
- }
- #define MSG_ACK_MASK (MSG_ACK | MSG_NACK)
- #define MSG_STATE_MASK (~MSG_ACK_MASK)
- struct node_info {
- struct list_head member_list;
- wait_queue_head_t member_events;
- spinlock_t member_list_lock;
- spinlock_t receive_lock;
- int peer_count, ignored_peer_count;
- struct toi_sysfs_data sysfs_data;
- enum cluster_message current_message;
- };
- struct node_info node_array[MAX_LOCAL_NODES];
- struct cluster_member {
- __be32 addr;
- enum cluster_message message;
- struct list_head list;
- int ignore;
- };
- #define toi_cluster_port_send 3501
- #define toi_cluster_port_recv 3502
- static struct net_device *net_dev;
- static struct toi_module_ops toi_cluster_ops;
- static int toi_recv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt, struct net_device *orig_dev);
- static struct packet_type toi_cluster_packet_type = {
- .type = htons(ETH_P_IP),
- .func = toi_recv,
- };
- struct toi_pkt { /* BOOTP packet format */
- struct iphdr iph; /* IP header */
- struct udphdr udph; /* UDP header */
- u8 htype; /* HW address type */
- u8 hlen; /* HW address length */
- __be32 xid; /* Transaction ID */
- __be16 secs; /* Seconds since we started */
- __be16 flags; /* Just what it says */
- u8 hw_addr[16]; /* Sender's HW address */
- u16 message; /* Message */
- unsigned long sid; /* Source ID for loopback testing */
- };
- static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE;
- static int added_pack;
- static int others_have_image;
- /* Key used to allow multiple clusters on the same lan */
- static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY;
- static char pre_hibernate_script[255] = CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE;
- static char post_hibernate_script[255] = CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE;
- /* List of cluster members */
- static unsigned long continue_delay = 5 * HZ;
- static unsigned long cluster_message_timeout = 3 * HZ;
- /* === Membership list === */
- static void print_member_info(int index)
- {
- struct cluster_member *this;
- pr_warn("==> Dumping node %d.\n", index);
- list_for_each_entry(this, &node_array[index].member_list, list)
- pr_warn("%d.%d.%d.%d last message %s. %s\n",
- NIPQUAD(this->addr),
- str_message(this->message), this->ignore ? "(Ignored)" : "");
- pr_warn("== Done ==\n");
- }
- static struct cluster_member *__find_member(int index, __be32 addr)
- {
- struct cluster_member *this;
- list_for_each_entry(this, &node_array[index].member_list, list) {
- if (this->addr != addr)
- continue;
- return this;
- }
- return NULL;
- }
- static void set_ignore(int index, __be32 addr, struct cluster_member *this)
- {
- if (this->ignore) {
- PRINTK("Node %d already ignoring %d.%d.%d.%d.\n", index, NIPQUAD(addr));
- return;
- }
- PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n", index, NIPQUAD(addr));
- this->ignore = 1;
- node_array[index].ignored_peer_count++;
- }
- static int __add_update_member(int index, __be32 addr, int message)
- {
- struct cluster_member *this;
- this = __find_member(index, addr);
- if (this) {
- if (this->message != message) {
- this->message = message;
- if ((message & MSG_NACK) &&
- (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
- set_ignore(index, addr, this);
- PRINTK("Node %d sees node %d.%d.%d.%d now sending %s.\n",
- index, NIPQUAD(addr), str_message(message));
- wake_up(&node_array[index].member_events);
- }
- return 0;
- }
- this = (struct cluster_member *)toi_kzalloc(36, sizeof(struct cluster_member), GFP_KERNEL);
- if (!this)
- return -1;
- this->addr = addr;
- this->message = message;
- this->ignore = 0;
- INIT_LIST_HEAD(&this->list);
- node_array[index].peer_count++;
- PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index,
- NIPQUAD(addr), str_message(message));
- if ((message & MSG_NACK) && (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
- set_ignore(index, addr, this);
- list_add_tail(&this->list, &node_array[index].member_list);
- return 1;
- }
- static int add_update_member(int index, __be32 addr, int message)
- {
- int result;
- unsigned long flags;
- spin_lock_irqsave(&node_array[index].member_list_lock, flags);
- result = __add_update_member(index, addr, message);
- spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
- print_member_info(index);
- wake_up(&node_array[index].member_events);
- return result;
- }
- static void del_member(int index, __be32 addr)
- {
- struct cluster_member *this;
- unsigned long flags;
- spin_lock_irqsave(&node_array[index].member_list_lock, flags);
- this = __find_member(index, addr);
- if (this) {
- list_del_init(&this->list);
- toi_kfree(36, this, sizeof(*this));
- node_array[index].peer_count--;
- }
- spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
- }
- /* === Message transmission === */
- static void toi_send_if(int message, unsigned long my_id);
- /*
- * Process received TOI packet.
- */
- static int toi_recv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt, struct net_device *orig_dev)
- {
- struct toi_pkt *b;
- struct iphdr *h;
- int len, result, index;
- unsigned long addr, message, ack;
- /* Perform verifications before taking the lock. */
- if (skb->pkt_type == PACKET_OTHERHOST)
- goto drop;
- if (dev != net_dev)
- goto drop;
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (!skb)
- return NET_RX_DROP;
- if (!pskb_may_pull(skb, sizeof(struct iphdr) + sizeof(struct udphdr)))
- goto drop;
- b = (struct toi_pkt *)skb_network_header(skb);
- h = &b->iph;
- if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
- goto drop;
- /* Fragments are not supported */
- if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
- if (net_ratelimit())
- pr_err("TuxOnIce: Ignoring fragmented " "cluster message.\n");
- goto drop;
- }
- if (skb->len < ntohs(h->tot_len))
- goto drop;
- if (ip_fast_csum((char *)h, h->ihl))
- goto drop;
- if (b->udph.source != htons(toi_cluster_port_send) ||
- b->udph.dest != htons(toi_cluster_port_recv))
- goto drop;
- if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
- goto drop;
- len = ntohs(b->udph.len) - sizeof(struct udphdr);
- /* Ok the front looks good, make sure we can get at the rest. */
- if (!pskb_may_pull(skb, skb->len))
- goto drop;
- b = (struct toi_pkt *)skb_network_header(skb);
- h = &b->iph;
- addr = SADDR;
- PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n",
- str_message(b->message), NIPQUAD(addr));
- message = b->message & MSG_STATE_MASK;
- ack = b->message & MSG_ACK_MASK;
- for (index = 0; index < num_local_nodes; index++) {
- int new_message = node_array[index].current_message, old_message = new_message;
- if (index == SADDR || !old_message) {
- PRINTK("Ignoring node %d (offline or self).\n", index);
- continue;
- }
- /* One message at a time, please. */
- spin_lock(&node_array[index].receive_lock);
- result = add_update_member(index, SADDR, b->message);
- if (result == -1) {
- pr_warn("Failed to add new cluster member "
- NIPQUAD_FMT ".\n", NIPQUAD(addr));
- goto drop_unlock;
- }
- switch (b->message & MSG_STATE_MASK) {
- case MSG_PING:
- break;
- case MSG_ABORT:
- break;
- case MSG_BYE:
- break;
- case MSG_HIBERNATE:
- /* Can I hibernate? */
- new_message = MSG_HIBERNATE | ((index & 1) ? MSG_NACK : MSG_ACK);
- break;
- case MSG_IMAGE:
- /* Can I resume? */
- new_message = MSG_IMAGE | ((index & 1) ? MSG_NACK : MSG_ACK);
- if (new_message != old_message)
- pr_err("Setting whether I can resume to %d.\n", new_message);
- break;
- case MSG_IO:
- new_message = MSG_IO | MSG_ACK;
- break;
- case MSG_RUNNING:
- break;
- default:
- if (net_ratelimit())
- pr_err("Unrecognised TuxOnIce cluster message %d from " NIPQUAD_FMT ".",
- b->message, NIPQUAD(addr));
- };
- if (old_message != new_message) {
- node_array[index].current_message = new_message;
- pr_warn(">>> Sending new message for node " "%d.\n", index);
- toi_send_if(new_message, index);
- } else if (!ack) {
- pr_warn(">>> Resending message for node %d.\n", index);
- toi_send_if(new_message, index);
- }
- drop_unlock:
- spin_unlock(&node_array[index].receive_lock);
- };
- drop:
- /* Throw the packet out. */
- kfree_skb(skb);
- return 0;
- }
- /*
- * Send cluster message to single interface.
- */
- static void toi_send_if(int message, unsigned long my_id)
- {
- struct sk_buff *skb;
- struct toi_pkt *b;
- int hh_len = LL_RESERVED_SPACE(net_dev);
- struct iphdr *h;
- /* Allocate packet */
- skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL);
- if (!skb)
- return;
- skb_reserve(skb, hh_len);
- b = (struct toi_pkt *)skb_put(skb, sizeof(struct toi_pkt));
- memset(b, 0, sizeof(struct toi_pkt));
- /* Construct IP header */
- skb_reset_network_header(skb);
- h = ip_hdr(skb);
- h->version = 4;
- h->ihl = 5;
- h->tot_len = htons(sizeof(struct toi_pkt));
- h->frag_off = htons(IP_DF);
- h->ttl = 64;
- h->protocol = IPPROTO_UDP;
- h->daddr = htonl(INADDR_BROADCAST);
- h->check = ip_fast_csum((unsigned char *)h, h->ihl);
- /* Construct UDP header */
- b->udph.source = htons(toi_cluster_port_send);
- b->udph.dest = htons(toi_cluster_port_recv);
- b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr));
- /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
- /* Construct message */
- b->message = message;
- b->sid = my_id;
- b->htype = net_dev->type; /* can cause undefined behavior */
- b->hlen = net_dev->addr_len;
- memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len);
- b->secs = htons(3); /* 3 seconds */
- /* Chain packet down the line... */
- skb->dev = net_dev;
- skb->protocol = htons(ETH_P_IP);
- if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol),
- net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) ||
- dev_queue_xmit(skb) < 0)
- pr_warn("E");
- }
- /* ========================================= */
- /* kTOICluster */
- static atomic_t num_cluster_threads;
- static DECLARE_WAIT_QUEUE_HEAD(clusterd_events);
- static int kTOICluster(void *data)
- {
- unsigned long my_id;
- my_id = atomic_add_return(1, &num_cluster_threads) - 1;
- node_array[my_id].current_message = (unsigned long)data;
- PRINTK("kTOICluster daemon %lu starting.\n", my_id);
- current->flags |= PF_NOFREEZE;
- while (node_array[my_id].current_message) {
- toi_send_if(node_array[my_id].current_message, my_id);
- sleep_on_timeout(&clusterd_events, cluster_message_timeout);
- PRINTK("Link state %lu is %d.\n", my_id, node_array[my_id].current_message);
- }
- toi_send_if(MSG_BYE, my_id);
- atomic_dec(&num_cluster_threads);
- wake_up(&clusterd_events);
- PRINTK("kTOICluster daemon %lu exiting.\n", my_id);
- __set_current_state(TASK_RUNNING);
- return 0;
- }
- static void kill_clusterd(void)
- {
- int i;
- for (i = 0; i < num_local_nodes; i++) {
- if (node_array[i].current_message) {
- PRINTK("Seeking to kill clusterd %d.\n", i);
- node_array[i].current_message = 0;
- }
- }
- wait_event(clusterd_events, !atomic_read(&num_cluster_threads));
- PRINTK("All cluster daemons have exited.\n");
- }
- static int peers_not_in_message(int index, int message, int precise)
- {
- struct cluster_member *this;
- unsigned long flags;
- int result = 0;
- spin_lock_irqsave(&node_array[index].member_list_lock, flags);
- list_for_each_entry(this, &node_array[index].member_list, list) {
- if (this->ignore)
- continue;
- PRINTK("Peer %d.%d.%d.%d sending %s. Seeking %s.",
- NIPQUAD(this->addr), str_message(this->message), str_message(message));
- if ((precise ? this->message : this->message & MSG_STATE_MASK) != message)
- result++;
- }
- spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
- PRINTK("%d peers in sought message.\n", result);
- return result;
- }
- static void reset_ignored(int index)
- {
- struct cluster_member *this;
- unsigned long flags;
- spin_lock_irqsave(&node_array[index].member_list_lock, flags);
- list_for_each_entry(this, &node_array[index].member_list, list)
- this->ignore = 0;
- node_array[index].ignored_peer_count = 0;
- spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
- }
- static int peers_in_message(int index, int message, int precise)
- {
- return node_array[index].peer_count -
- node_array[index].ignored_peer_count - peers_not_in_message(index, message, precise);
- }
- static int time_to_continue(int index, unsigned long start, int message)
- {
- int first = peers_not_in_message(index, message, 0);
- int second = peers_in_message(index, message, 1);
- PRINTK("First part returns %d, second returns %d.\n", first, second);
- if (!first && !second) {
- PRINTK("All peers answered message %d.\n", message);
- return 1;
- }
- if (time_after(jiffies, start + continue_delay)) {
- PRINTK("Timeout reached.\n");
- return 1;
- }
- PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies, start + continue_delay);
- return 0;
- }
- void toi_initiate_cluster_hibernate(void)
- {
- int result;
- unsigned long start;
- result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
- if (result)
- return;
- toi_send_if(MSG_HIBERNATE, 0);
- start = jiffies;
- wait_event(node_array[0].member_events, time_to_continue(0, start, MSG_HIBERNATE));
- if (test_action_state(TOI_FREEZER_TEST)) {
- toi_send_if(MSG_ABORT, 0);
- start = jiffies;
- wait_event(node_array[0].member_events, time_to_continue(0, start, MSG_RUNNING));
- do_toi_step(STEP_QUIET_CLEANUP);
- return;
- }
- toi_send_if(MSG_IO, 0);
- result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
- if (result)
- return;
- /* This code runs at resume time too! */
- if (toi_in_hibernate)
- result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
- }
- EXPORT_SYMBOL_GPL(toi_initiate_cluster_hibernate);
- /* toi_cluster_print_debug_stats
- *
- * Description: Print information to be recorded for debugging purposes into a
- * buffer.
- * Arguments: buffer: Pointer to a buffer into which the debug info will be
- * printed.
- * size: Size of the buffer.
- * Returns: Number of characters written to the buffer.
- */
- static int toi_cluster_print_debug_stats(char *buffer, int size)
- {
- int len;
- if (strlen(toi_cluster_iface))
- len = scnprintf(buffer, size, "- Cluster interface is '%s'.\n", toi_cluster_iface);
- else
- len = scnprintf(buffer, size, "- Cluster support is disabled.\n");
- return len;
- }
/* toi_cluster_memory_needed
 *
 * Description:	Report how much memory this module needs during
 *		hibernate/resume.
 * Returns:	int. Always 0.
 */
static int toi_cluster_memory_needed(void)
{
	const int bytes_needed = 0;

	return bytes_needed;
}
- static int toi_cluster_storage_needed(void)
- {
- return 1 + strlen(toi_cluster_iface);
- }
- /* toi_cluster_save_config_info
- *
- * Description: Save informaton needed when reloading the image at resume time.
- * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE.
- * Returns: Number of bytes used for saving our data.
- */
- static int toi_cluster_save_config_info(char *buffer)
- {
- strcpy(buffer, toi_cluster_iface);
- return strlen(toi_cluster_iface + 1);
- }
- /* toi_cluster_load_config_info
- *
- * Description: Reload information needed for declustering the image at
- * resume time.
- * Arguments: Buffer: Pointer to the start of the data.
- * Size: Number of bytes that were saved.
- */
- static void toi_cluster_load_config_info(char *buffer, int size)
- {
- strncpy(toi_cluster_iface, buffer, size);
- }
- static void cluster_startup(void)
- {
- int have_image = do_check_can_resume(), i;
- unsigned long start = jiffies, initial_message;
- struct task_struct *p;
- initial_message = MSG_IMAGE;
- have_image = 1;
- for (i = 0; i < num_local_nodes; i++) {
- PRINTK("Starting ktoiclusterd %d.\n", i);
- p = kthread_create(kTOICluster, (void *)initial_message, "ktoiclusterd/%d", i);
- if (IS_ERR(p)) {
- pr_err("Failed to start ktoiclusterd.\n");
- return;
- }
- wake_up_process(p);
- }
- /* Wait for delay or someone else sending first message */
- wait_event(node_array[0].member_events, time_to_continue(0, start, MSG_IMAGE));
- others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1);
- pr_warn("Continuing. I %shave an image. Peers with image: %d.",
- have_image ? "" : "don't ", others_have_image);
- if (have_image) {
- int result;
- /* Start to resume */
- pr_warn(" === Starting to resume ===\n");
- node_array[0].current_message = MSG_IO;
- toi_send_if(MSG_IO, 0);
- /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
- result = 0;
- if (!result) {
- /*
- * Atomic restore - we'll come back in the hibernation
- * path.
- */
- /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
- result = 0;
- /* do_toi_step(STEP_QUIET_CLEANUP); */
- }
- node_array[0].current_message |= MSG_NACK;
- /* For debugging - disable for real life? */
- wait_event(node_array[0].member_events, time_to_continue(0, start, MSG_IO));
- }
- if (others_have_image) {
- /* Wait for them to resume */
- pr_warn("Waiting for other nodes to resume.\n");
- start = jiffies;
- wait_event(node_array[0].member_events, time_to_continue(0, start, MSG_RUNNING));
- if (peers_not_in_message(0, MSG_RUNNING, 0))
- pr_warn("Timed out while waiting for other " "nodes to resume.\n");
- }
- /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
- * as appropriate.
- *
- * If we don't have an image:
- * - Wait until someone else says they have one, or conditions are met
- * for continuing to boot (n machines or t seconds).
- * - If anyone has an image, wait for them to resume before continuing
- * to boot.
- *
- * If we have an image:
- * - Wait until conditions are met before continuing to resume (n
- * machines or t seconds). Send RESUME_PREP and freeze processes.
- * NACK_PREP if freezing fails (shouldn't) and follow logic for
- * us having no image above. On success, wait for [N]ACK_PREP from
- * other machines. Read image (including atomic restore) until done.
- * Wait for ACK_READ from others (should never fail). Thaw processes
- * and do post-resume. (The section after the atomic restore is done
- * via the code for hibernating).
- */
- node_array[0].current_message = MSG_RUNNING;
- }
- /* toi_cluster_open_iface
- *
- * Description: Prepare to use an interface.
- */
- static int toi_cluster_open_iface(void)
- {
- struct net_device *dev;
- rtnl_lock();
- for_each_netdev(&init_net, dev) {
- if (/* dev == &init_net.loopback_dev || */
- strcmp(dev->name, toi_cluster_iface))
- continue;
- net_dev = dev;
- break;
- }
- rtnl_unlock();
- if (!net_dev) {
- pr_err(MYNAME ": Device %s not found.\n", toi_cluster_iface);
- return -ENODEV;
- }
- dev_add_pack(&toi_cluster_packet_type);
- added_pack = 1;
- loopback_mode = (net_dev == init_net.loopback_dev);
- num_local_nodes = loopback_mode ? 8 : 1;
- PRINTK("Loopback mode is %s. Number of local nodes is %d.\n",
- loopback_mode ? "on" : "off", num_local_nodes);
- cluster_startup();
- return 0;
- }
- /* toi_cluster_close_iface
- *
- * Description: Stop using an interface.
- */
- static int toi_cluster_close_iface(void)
- {
- kill_clusterd();
- if (added_pack) {
- dev_remove_pack(&toi_cluster_packet_type);
- added_pack = 0;
- }
- return 0;
- }
- static void write_side_effect(void)
- {
- if (toi_cluster_ops.enabled) {
- toi_cluster_open_iface();
- set_toi_state(TOI_CLUSTER_MODE);
- } else {
- toi_cluster_close_iface();
- clear_toi_state(TOI_CLUSTER_MODE);
- }
- }
/*
 * node_write_side_effect - callback for writes to the node_<n> sysfs entries.
 *
 * Intentionally empty.
 */
static void node_write_side_effect(void)
{
	return;
}
- /*
- * data for our sysfs entries.
- */
- static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0,
- NULL),
- SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0,
- write_side_effect),
- SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL),
- SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script,
- 256, 0, NULL),
- SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script,
- 256, 0, STRING),
- SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ,
- 0)
- };
- /*
- * Ops structure.
- */
- static struct toi_module_ops toi_cluster_ops = {
- .type = FILTER_MODULE,
- .name = "Cluster",
- .directory = "cluster",
- .module = THIS_MODULE,
- .memory_needed = toi_cluster_memory_needed,
- .print_debug_info = toi_cluster_print_debug_stats,
- .save_config_info = toi_cluster_save_config_info,
- .load_config_info = toi_cluster_load_config_info,
- .storage_needed = toi_cluster_storage_needed,
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data),
- };
- /* ---- Registration ---- */
- #ifdef MODULE
- static int __init toi_cluster_init(void)
- #else
- static int toi_cluster_init(void)
- #endif
- {
- int temp = toi_register_module(&toi_cluster_ops), i;
- struct kobject *kobj = toi_cluster_ops.dir_kobj;
- for (i = 0; i < MAX_LOCAL_NODES; i++) {
- node_array[i].current_message = 0;
- INIT_LIST_HEAD(&node_array[i].member_list);
- init_waitqueue_head(&node_array[i].member_events);
- spin_lock_init(&node_array[i].member_list_lock);
- spin_lock_init(&node_array[i].receive_lock);
- /* Set up sysfs entry */
- node_array[i].sysfs_data.attr.name = toi_kzalloc(8,
- sizeof(node_array[i].sysfs_data.attr.name),
- GFP_KERNEL);
- sprintf((char *)node_array[i].sysfs_data.attr.name, "node_%d", i);
- node_array[i].sysfs_data.attr.mode = SYSFS_RW;
- node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER;
- node_array[i].sysfs_data.flags = 0;
- node_array[i].sysfs_data.data.integer.variable =
- (int *)&node_array[i].current_message;
- node_array[i].sysfs_data.data.integer.minimum = 0;
- node_array[i].sysfs_data.data.integer.maximum = INT_MAX;
- node_array[i].sysfs_data.write_side_effect = node_write_side_effect;
- toi_register_sysfs_file(kobj, &node_array[i].sysfs_data);
- }
- toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0);
- if (toi_cluster_ops.enabled)
- toi_cluster_open_iface();
- return temp;
- }
- #ifdef MODULE
- static void __exit toi_cluster_exit(void)
- #else
- static void toi_cluster_exit(void)
- #endif
- {
- int i;
- toi_cluster_close_iface();
- for (i = 0; i < MAX_LOCAL_NODES; i++)
- toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj, &node_array[i].sysfs_data);
- toi_unregister_module(&toi_cluster_ops);
- }
- static int __init toi_cluster_iface_setup(char *iface)
- {
- toi_cluster_ops.enabled = (*iface && strcmp(iface, "off"));
- if (toi_cluster_ops.enabled)
- strncpy(toi_cluster_iface, iface, strlen(iface));
- }
- __setup("toi_cluster=", toi_cluster_iface_setup);
/* Module entry points and metadata; only used when built as a module. */
#ifdef MODULE
module_init(toi_cluster_init);
module_exit(toi_cluster_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Nigel Cunningham");
MODULE_DESCRIPTION("Cluster Support for TuxOnIce");
#endif
|