% The Chicago Journal of Theoretical Computer Science, Volume 1997, Article 4
% Published by MIT Press, Cambridge, Massachusetts USA
% Copyright 1997 by Massachusetts Institute of Technology
%
% Token register selecting the journal's visual style; set before
% \documentclass so the class file can read it at load time.
% NOTE(review): presumably consumed by cjstruct.cls -- confirm there.
\newtoks\rstyle
\rstyle={cjlook}
%
\documentclass[v1.1,published]{cjstruct}
%
% Local style resources for this article.
%
\usestyleresource{amstext}
%
% Local macro definitions for this article.
%
% Temporarily make `@' a letter (catcode 11) so the definitions below
% may use LaTeX-internal names such as \@beginparpenalty.
\catcode`\@=11
% Each \ifx compares the command against \undeftex, a name that is
% never defined anywhere: the comparison is true exactly when the
% command on the left is also undefined.  This guards against a class
% version that already provides these macros.
\ifx\nopagebreakbeginpar\undeftex
% Penalty 10000 after the first line of a paragraph: no break there.
\newcommand{\nopagebreakbeginpar}{\@beginparpenalty10000}
\fi
\ifx\nopagebreakendpar\undeftex
% Penalty 10000 before the last line of a paragraph: no break there.
\newcommand{\nopagebreakendpar}{\@endparpenalty10000}
\fi
\ifx\nopagebreakinteritem\undeftex
% Penalty 10000 between list items: no break between items.
\newcommand{\nopagebreakinteritem}{\@itempenalty10000}
\fi
% Restore `@' to its normal catcode ("other").
\catcode`\@=12
%
% Select the upright ("n") font shape; used when typesetting proofs.
\newcommand{\prooffontshape}{\fontshape{n}\selectfont}
%
% \topo: notation "T." prefixed to a state, as in \topo\alpha for the
% topology of global state alpha (math mode only).
% \tupsub: tuple-removal symbol, a tightened double \setminus.
% Defined with \newcommand* rather than \def so that an accidental
% clash with a name provided by the document class is reported as an
% error instead of being silently overwritten.
\newcommand*{\topo}{\mathcal{T}\!.}
\newcommand*{\tupsub}{\setminus\!\!\setminus\,}
%
% \registered: a superscript registered-trademark sign, built by
% overlaying (plain TeX \ooalign) a \tiny upright "R" on the
% \scriptsize circle from math-symbol family 2 (\mathhexbox20D).
% Usable in text; it opens its own inline math with \( ... \).
\def\registered{\({}^{\ooalign%
{\hfil\raise.07ex\hbox{{\tiny\textup{R}}}%
\hfil\crcr\scriptsize\mathhexbox20D\cr}}\)}
%
% end of local macro definitions for this article
%
\begin{document}
%
%STANDARD
%
\begin{articleinfo}
%
\publisher{The MIT Press}
%
\publishercomment{%
{\Large
\begin{center}
Volume 1997, Article 4\\
\textit{19 December 1997}
\end{center}
}
\par\noindent
ISSN 1073-0486\@. MIT Press Journals, Five Cambridge Center, Cambridge, MA
02142-1493 USA; (617)253-2889;
\emph{journals-orders@mit.edu, journals-info@mit.edu}\@.
Published one article at a time in \LaTeX\ source form on the
Internet\@. Pagination varies from copy to copy\@. For more
information and other articles see:
\begin{itemize}
\item \emph{http://mitpress.mit.edu/CJTCS/}
\item \emph{http://www.cs.uchicago.edu/publications/cjtcs/}
\item \emph{ftp://mitpress.mit.edu/pub/CJTCS}
\item \emph{ftp://cs.uchicago.edu/pub/publications/cjtcs}
\end{itemize}
\sentence
\par\noindent
The \emph{Chicago Journal of Theoretical Computer Science} is
abstracted or indexed in \emph{Research Alert,\registered
SciSearch,\registered Current Contents\registered/Engineering
Computing \& Technology}, and \emph{CompuMath Citation Index\@.\registered}
}
%
\copyrightstmt{%
\copyright 1997 The Massachusetts Institute of Technology\@.
Subscribers are licensed to use journal articles in a variety of
ways, limited only as required to insure fair attribution to authors
and the journal, and to prohibit use in a competing commercial
product\@. See the journal's World Wide Web site for further
details\@. Address inquiries to the Subsidiary Rights Manager, MIT
Press Journals; (617)253-2864;
\emph{journals-rights@mit.edu}\@.\pagebreak[2]
}
%
\journal{Chicago\nolinebreak[1] Journal of Theoretical Computer\nolinebreak[1] Science}
\journalcomment{%
The \emph{Chicago Journal of Theoretical Computer Science}
is a peer-reviewed scholarly journal in theoretical computer
science\@. The journal is committed to providing a forum for
significant results on theoretical aspects of all topics in computer
science\@.
\par
\begin{trivlist}
\item[\emph{Editor in chief:}] Janos Simon
\item[\emph{Consulting editors:}]
Joseph Halpern, Stuart A.\ Kurtz, Raimund Seidel
\item[\emph{Editors:}]
\begin{tabular}[t]{lll}
Martin Abadi & Greg Frederickson & John Mitchell \\
Pankaj Agarwal & Andrew Goldberg & Ketan Mulmuley \\
Eric Allender & Georg Gottlob & Gil Neiger \\
Tetsuo Asano & Vassos Hadzilacos & David Peleg \\
Laszl\'o Babai & Juris Hartmanis & Andrew Pitts \\
Eric Bach & Maurice Herlihy & James Royer \\
Stephen Brookes & Ted Herman & Alan Selman \\
Jin-Yi Cai & Stephen Homer & Nir Shavit \\
Anne Condon & Neil Immerman & Eva Tardos \\
Cynthia Dwork & Howard Karloff & Sam Toueg \\
David Eppstein & Philip Klein & Moshe Vardi \\
Ronald Fagin & Phokion Kolaitis & Jennifer Welch \\
Lance Fortnow & Stephen Mahaney & Pierre Wolper \\
Steven Fortune & Michael Merritt
\end{tabular}
\vspace{1ex}
\item[\emph{Managing editor:}] Michael J.\ O'Donnell
\vspace{1ex}
\item[\emph{Electronic mail:}] \emph{chicago-journal@cs.uchicago.edu}
\end{trivlist}
\sentence
}
%
\bannerfile{cjtcs-banner.tex}
%
\volume{1997}
%
\articleid{4}
%
\selfcitation{
@article{cj97-04,
author={Shlomi Dolev and Ted Herman},
title={Superstabilizing Protocols for Dynamic Distributed Systems},
journal={Chicago Journal of Theoretical Computer Science},
volume={1997},
number={4},
publisher={MIT Press},
month={December},
year={1997}
}
}
%
\begin{retrievalinfo}
\end{retrievalinfo}
%
\title{Superstabilizing Protocols for Dynamic Distributed
Systems\footnote{A preliminary version of this work was presented at
the 2nd Workshop on Self-Stabilizing Systems, Las Vegas, Nevada, May 1995.}}
%
\shorttitle{Superstabilizing Protocols}
\editorname{}
%
\editorcomment{%
\samepage%
}
%
\begin{authorinfo}
%
% Correction to error in cjstruct.sty version 1.1.
%
% Apply the patch only when the running class is exactly the broken
% release: the \ifx is true iff \cjversion has the same definition as
% \fixversion ("1.1"); later class versions are left untouched.
\def\fixversion{1.1}
\ifx\cjversion\fixversion
% Replacement \printname: globally record the author's printed name in
% the token register \authorstring, raise the \authorstrue flag, and
% in info mode echo the name as a list item.
% NOTE(review): the v1.1 defect is not described beyond the heading
% above; presumably the class's own \printname omitted one of these
% global assignments -- confirm against cjstruct.sty 1.1.
\renewcommand{\printname}[1]{
\global\authorstring={#1}
\global\authorstrue
\ifinfo\item Printed name: #1\fi
}
\fi
%
% End of correction.
%
\printname{Shlomi Dolev}
\collname{}{Dolev, Shlomi}
\begin{institutioninfo}
\instname{Ben-Gurion University of the Negev}
\department{Department of Mathematics and Computer Science}
\address{}{}{Beer-Sheva}{Israel}{84105}
\end{institutioninfo}
\email{dolev@cs.bgu.ac.il}
\end{authorinfo}
\begin{authorinfo}
\printname{Ted Herman}
\collname{}{Herman, Ted}
\begin{institutioninfo}
\instname{University of Iowa}
\department{Department of Computer Science}
\address{}{Iowa City}{IA}{USA}{52242}
\end{institutioninfo}
\email{herman@cs.uiowa.edu}
\end{authorinfo}
%
\shortauthors{Dolev and Herman}
%
\support{Part of Shlomi Dolev's research was supported
by TAMU Engineering Excellence funds,
by NSF Presidential Young Investigator Award CCR-9396098,
and by the Israeli Ministry of Science and Arts Grant 6756195.
\apar{}
Ted Herman's research was supported in part by
the Netherlands Organization for Scientific Research (NWO)
under contract NF 62-376 (NFI project ALADDIN:
Algorithmic Aspects of Parallel and Distributed Systems).
}
%
\begin{editinfo}
\submitted{}{}{1997}\reported{}{}{1997}\published{19}{12}{1997}
\end{editinfo}
%
\end{articleinfo}
%
\begin{articletext}
%
% STANDARD
\begin{abstract}
\apar{Abstract-1}
Two aspects of distributed-protocol reliability are
the ability to recover from transient faults and
the ability to function in a dynamic environment\@.
Approaches for both of these aspects have been separately
developed, but have revealed drawbacks when applied to environments
with both transient faults and dynamic changes\@.
This paper introduces definitions and methods for addressing
both concerns in system design\@.
%
\apar{Abstract-2}
A protocol is \emph{superstabilizing} if it is
(1) self-stabilizing, meaning that it is guaranteed
to respond to an arbitrary transient fault by
eventually satisfying and maintaining a \emph{legitimacy} predicate,
and it is (2) guaranteed to satisfy a \emph{passage} predicate
at all times when the system undergoes topology changes starting
from a legitimate state\@. The passage predicate is typically a safety
property that should hold while the protocol makes progress toward
reestablishing legitimacy following a topology change\@.
%
\apar{Abstract-3}
Specific contributions of the paper include:
the definition of the class of superstabilizing
protocols; metrics for evaluating superstabilizing protocols;
superstabilizing protocols for coloring and spanning tree
construction; a general method for
converting self-stabilizing protocols into superstabilizing ones;
and a generalized form of a self-stabilizing topology
update protocol which may have useful applications for other
research\@.
\end{abstract}
\asection{1}{Introduction} \label{introduction}
%
\apar{1-1}
The most general technique enabling a system to tolerate arbitrary
transient faults is \emph{self-stabilization}:
a protocol is self-stabilizing if, in response to any transient fault,
it converges to a legitimate
state in finite time\@. The characterization of legitimate states,
given by a legitimacy predicate, specifies the protocol's function\@.
Such protocols are generally evaluated by studying the efficiency of
convergence, which entails bounding the time of convergence
to a legitimate state following a transient fault\@. Other aspects
of convergence, for instance, safety properties, are of less interest,
because arbitrary transient faults can falsify any nontrivial
safety property\@.
%
\apar{1-2}
The model of a \emph{dynamic} system is one where
communication links and processors may fail and recover during normal
operation\@. Protocols for dynamic systems are
designed to contend with such failures and recoveries without
global reinitialization\@.
These protocols consider only
global states that are reachable from a predefined initial
state under a \emph{constrained sequence of failures};
under such an assumption, the
protocols attempt to handle failures with as
few adjustments as possible\@.
Thus, whereas self-stabilization research largely ignores the
behavior of protocols between the time of a transient fault
and restoration to a legitimate state, dynamic protocols make
guarantees about behavior at all times (during the period between a
failure event and the completion of necessary adjustments)\@.
\asubsection{1.1}{Superstabilization}
%
\apar{1.1-1}
Superstabilizing protocols combine benefits of both self-stabilizing
and dynamic protocols\@. We retain the idea of a legitimate state,
but partition illegitimate states into two classes, depending on
whether or not they satisfy a \emph{passage} predicate\@.
Roughly speaking, a protocol is superstabilizing if: (1) it
is self-stabilizing, and (2) when it is started in a
legitimate state and a topology change occurs, the passage predicate
holds and continues to hold until the protocol reaches a
legitimate state\@.
%
\apar{1.1-2}
Superstabilization is specified with respect to the
class of topology changes for which a desired passage
predicate can be maintained during convergence to legitimacy\@.
Since a legitimacy predicate is dependent on
system topology, a topology change will typically falsify
legitimacy\@. The passage predicate must
therefore be weaker than legitimacy, but strong enough to be
useful; ideally, the passage predicate should be the strongest
predicate that holds when a legitimate state undergoes
any of a class of
topology change events\@. One example for a passage
predicate is the existence
of at most one token in a mutual exclusion task:
whereas in a legitimate state exactly one token
exists, a processor crash could eliminate the token yet
not falsify the passage predicate\@.
Similarly, for the leader election task, the passage
predicate could specify that at most one leader exists\@.
A useful passage predicate can thus represent a
critical safety aspect of the task in question\@.
A superstabilizing protocol is more stable\footnote
{This stronger property of stability motivates the
term \emph{superstabilizing}\@.} than
a protocol that is only self-stabilizing: at a legitimate
state, the system maintains stability with respect to
the passage predicate, even when perturbed by a topology change\@.
%
\apar{1.1-3}
Superstabilizing protocols are evaluated in several ways\@.
Of interest are the worst-case convergence time, i.e., the
time required to establish a legitimate state following
either a transient fault or a topology change, and the
scope of the convergence in terms of how much of the network's
data must be changed as a result of convergence\@.
We classify superstabilizing protocols by the following
complexity measures:
\begin{itemize}
\item stabilization time---the maximum amount of time it takes
for the protocol to reach a legitimate state;
\item superstabilization time---the maximum amount of time
it takes for a protocol starting from a legitimate state, followed
by a single topology change, to reach a legitimate state; and
\item adjustment measure---the maximum number of processors
that must change their local states, upon a topology change from a
legitimate state, so that the protocol is in a legitimate state\@.
\end{itemize}
\asubsection{1.2}{Background and Motivation}
%
\apar{1.2-1}
Many distributed protocols have
been designed to cope with
continuous dynamic changes \cite{cj97-04-01,cj97-04-06,cj97-04-09,cj97-04-07}\@.
These protocols make certain assumptions about
the behavior of processors and links during failure and recovery;
for instance, most of them do not consider the possibility
of processor crashes, and they assume that every
corrupted message is identified and discarded\@.
If failures are frequent, these restrictive assumptions
can be too optimistic\@.
%
\apar{1.2-2}
A number of researchers \cite{cj97-04-13,cj97-04-16,cj97-04-10,cj97-04-05} suggest a
self-stabilizing approach for dealing with dynamic systems\@.
In a self-stabilizing approach, a state following a topology change is seen as an
inconsistent state from which the system will converge to
a state consistent with the new topology\@. Although self-stabilization
can handle dynamic systems, the primary goal of a self-stabilizing
protocol is to recover from transient faults\@. This
view has influenced the design and analysis of self-stabilizing
protocols; for instance, for a correct self-stabilizing protocol,
there are no restrictions on the behavior of the system
\emph{during} the convergence period---the only guarantee is
convergence to a legitimate state\@.
%
\apar{1.2-3}
When considering the treatment of a dynamic system, self-stabilization
differs from the dynamic protocols cited above in the way that topology
changes are modeled\@. The dynamic protocols assume that topology
changes are \emph{events\/} signaling changes on incident processors\@.
Self-stabilizing protocols take a necessarily more conservative
approach that is entirely state-based: a topology change results
in a new state, from which convergence to legitimacy is guaranteed,
with no dependence on a signal\@. Yet when the system
is in a legitimate state and a fault happens to be a detected event,
can the behavior during the convergence be constrained
to satisfy some desired safety property? For instance, is
it possible in these situations for the protocol to maintain
a ``nearly legitimate'' state during convergence?
%
\apar{1.2-4}
The issue can be motivated by considering the problem of maintaining
a spanning tree in a network\@. Suppose the spanning tree is
used for virtual circuits between processors in the network\@.
When a tree link fails, the spanning tree becomes disconnected;
yet virtual circuits entirely within a connected component can continue
to operate\@. We would like to restore the
system to have a spanning tree so that
existing virtual circuits in the connected components
remain operating; thus a least-impact legitimate state
would be realized by simply choosing a link to connect the
components\@.
%
\apar{1.2-5}
One thesis of this paper is that self-stabilizing protocols can be
designed with dynamic change in mind to improve response\@.
Self-stabilizing protocols proposed for dynamic systems
do not use the fact that a processor can
detect that it is recovering
following a crash; consequently, there is no possibility
of executing an ``initialization'' procedure during this
recovery\@.\footnote{Moreover, some systems \emph{do\/}
provide a ``start'' signal\@.
Self-stabilizing protocols do not specify any
special starting state or action
upon receiving such a signal\@. Note that a
computation invoked by a start signal resembles
the case of a dynamic system starting with all processors
simultaneously recovering from a crash\@.}
A key observation for
this paper is that a topology change is \emph{usually\/}
a detectable event, and in cases where a topology change
is not detected, we use self-stabilization as a fallback
mechanism to deal with the change\@. The remainder of the paper
illustrates aspects of superstabilization with selected protocols,
and a general method that converts self-stabilizing protocols into
superstabilizing protocols\@.
\asection{2}{Dynamic Systems} \label{model}
%
\apar{2-1}
A system is represented by a graph where processors are nodes and links are
(undirected) edges\@. An edge between two processors exists if and only if the two
processors are \emph{neighbors}; processors may only communicate if they
are neighbors\@. Each processor has a unique identifier taken from a totally
ordered domain\@. We use \(p\), \(q\), and \(r\) to denote processor identifiers\@.
Processors communicate using registers, however, application of the model
to a message-passing system is intended; we outline an
implementation of the register model in terms
of message-based constructions in \cite{cj97-04-14,cj97-04-12}\@.
%
\apar{2-2}
Associated with each processor \(p\) are
code, local variables, a program counter, a shared register,
and an input variable
\(N_p\), which is a list of processors \(q\) that are
neighbors of \(p\)\@. Invariantly, neighborhoods
satisfy \(p\not\in{N_p}\) and \(q\in N_p \Leftrightarrow p\in N_q\)\@.
A processor can read from and write to its own
shared register, but may only read shared registers belonging
to neighboring processors\@.
The code of a processor
is a sequential program; a program counter is associated with
each processor\@. An \emph{atomic step\/}
of a processor (in the sequel referred to as steps) consists of
the execution of one statement in a program\@.
In one atomic step, a processor performs some
internal computation and, at most, one register operation\@.
A processor has two possible register operations, {\sf read} and {\sf write}\@.
%
\apar{2-3}
Our model specifies that a step consists of a statement execution; we
have in mind a conventional instruction-execution architecture, where each
statement corresponds to some low-level code\@. However,
to make the presentation of protocols concise,
we give descriptions at a higher level in terms of programs
with assignment statements and control structures (\textbf{forall}, \textbf{do},
etc.); it should be understood that these descriptions can be resolved into
lower-level programs where statements translate into atomic steps\@.
In the protocol presentations, we also use the convention
that advancing the program counter beyond the last statement of
a program returns the program counter to the program's first
statement; thus each program takes the form of an infinite loop\@.
%
\apar{2-4}
Local processor variables are of two types: variables used for
computations, and \emph{field image\/} variables\@. The former are
denoted using unsubscripted variable names such as \(x\), \(y\), and \(A\)\@.
The field image variables refer to fields of registers; these
variables are subscripted to refer to the register location. For
instance, \(e_p\) refers to a field of processor \(p\)'s register, and
\(y_q\) refers to a field of processor \(q\)'s register\@. Program
statements that assign to field images or use field images in
calculations are not register operations: the field image is
essentially a cache of an actual register field\@.
A processor \(p\)'s \(\textsf{read}(q)\) operation,
defined for \(q\in N_p\), atomically reads the register
of processor \(q\) and assigns all corresponding field images (e.g., \(e_q\),
\(y_q\), etc.) at processor \(p\)\@. A \textsf{write} operation atomically
sets all fields of \(p\)'s register to current image values\@.
For convenience, we also permit a local calculation to
specify field image(s) with a \textsf{write} statement;
for instance, \(\textsf{write}(e_p := 1)\) assigns to field image
\(e_p\) and then writes to \(p\)'s register\@.
%
\apar{2-5}
The state of a processor \(p\) fully describes the values of its
local variables, program counter, shared register, and its
neighborhood \(N_p\) (although a processor cannot change its
neighborhood, it is convenient to include \(N_p\) in the state
of \(p\) for subsequent definitions)\@. In the sequel, we
occasionally refer to the state of a processor as a \emph{local state}\@.
The state of the system is a vector of states of all processors;
a system state is called a \emph{global state}\@.
For a global state \(\sigma\) and a processor \(q\), let \(\sigma[q]\)
denote the local state of \(q\) in state \(\sigma\)\@.
A \emph{computation} is a sequence of global
states \(\Theta=(\theta_1,\theta_2,\dots)\) such that for \(i=1,2,\dots\)
the global state \(\theta_{i+1}\) is reached from \(\theta_i\) by a single
step of some processor\@. A \emph{fair} computation is a computation
that is either finite or contains infinitely many steps
of each (noncrashed) processor\@.
%
\apar{2-6}
We write \(\sigma\vdash\mathcal{Q}\) to denote that
global state \(\sigma\) satisfies a predicate \(\cal Q\)\@.
Suppose \(\cal P\) is a predicate that encodes some property of interest
for our system; for instance, \(\cal P\) can specify that exactly one
processor has a token for mutual exclusion\@.
A legitimacy predicate \(\cal L\) is typically specified with
respect to \(\cal P\): whereas \(\cal P\) is concerned with
those aspects of a state to determine whether or not the property
of interest holds, the predicate \(\cal L\)
specifies permissible values of all register fields,
program counters, and local variables so that \(\cal P\) remains invariantly
true in a computation\@.
%
\apar{2-7}
A system \emph{topology} is a specific system configuration of links and
processors\@. Each processor can determine the current status
of its neighborhood from its local state (via \(N_p\));
thus the system topology can be extracted by a program
from a global state of the system\@.
Let \(\topo\alpha\) denote
the topology for a given global state \(\alpha\)\@.
Dynamic changes transform the system from one
topology \(\topo\alpha\) to another topology \(\topo\beta\) by
changing neighborhoods and possibly removing or adding processors\@.
%
\apar{2-8}
A topology change \emph{event} is the removal or addition
of a single neighborhood component (link, processor, or processor
and subset of its incident links),
together with the execution
of certain atomic steps specified in the sequel\@.
Topology changes involving numerous links and processors can
be modeled by a sequence of single change events\@.
The crash of processor \(p\) is
denoted \(\textsf{crash}_p\); the recovery of processor \(p\) is
denoted \(\textsf{recov}_p\); \(\textsf{crash}_{pq}\) and \(\textsf{recov}_{pq}\)
denote link failure and recovery events, respectively\@. In our model, a
processor crash and a link crash are indistinguishable to a
neighbor of the event: if \(p\) and \(q\) are neighbors and \(\textsf{crash}_p\)
occurs, then we model this event by \(\textsf{crash}_{pq}\) with respect
to reasoning about processor \(q\)\@. Similarly, a \(\textsf{recov}_p\) event
is indistinguishable from a \(\textsf{recov}_{pq}\) event with respect to
reasoning about a neighbor \(q\) of \(p\)\@.
We say that a topology change event \(\cal E\) is \emph{incident}
on \(p\) if \(\cal E\) is \(\textsf{recov}_p\),
\(\textsf{crash}_{pq}\), or \(\textsf{recov}_{pq}\)\@.
We extend this definition to be symmetric: \(\cal E\) is incident
on \(p\) if and only if \(p\) is incident on \(\cal E\)\@.
%
\apar{2-9}
For most of the protocols presented in this paper,
each processor is equipped with an \emph{interrupt statement,\/}
which is a statement concerned with adjusting to topology change\@.
A topology change \(\cal E\) incident on \(p\)
causes the following to atomically
occur at \(p\): the input variable \(N_p\)
is changed to reflect \(\cal E\),
the interrupt statement of the protocol is atomically executed, and
\(p\)'s program counter is set to the first statement of its program\@.
Note that if \(\cal E\)
is incident on numerous processors, then all incident neighborhoods
change to reflect \(\cal E\), and all
processors execute the interrupt statement atomically with
event \(\cal E\)\@. Thus the transition by \(\cal E\) from
\(\topo\alpha\) to \(\topo\beta\) changes more than the neighborhoods;
states \(\alpha\) and \(\beta\) also differ in the
local states of processors incident on \(\cal E\),
due to execution of interrupt statements at these processors\@.
%
\apar{2-10}
A \emph{trajectory} is a sequence of global states in which
each segment is either a fair computation or a sequence of
topology change events\@. For purposes of reasoning
about self-stabilization, we follow the standard method of
proving properties of computations, not trajectories\@.
Dynamic change is handled indirectly in this approach: following
an event \(\cal E\), if there are no further changes for a sufficiently
long period, the protocol self-stabilizes in the computation
following \(\cal E\) in the trajectory\@.
\asection{3}{The Superstabilization Paradigm} \label{superstabilization}
%
\apar{3-1}
The definition of superstabilization takes the
idea of a ``typical'' change into account by specifying a
class \(\Lambda\) of topology change events\@. A self-stabilizing
protocol is superstabilizing with respect to events of type \(\Lambda\),
if starting from a legitimate state followed by a \(\Lambda\)-event,
the passage predicate holds continuously:
%
\begin{alabsubtext}{Definition}{1}{} \label{superstab}
A protocol \(P\) is \emph{superstabilizing} with respect to
\(\Lambda\) if and only if \(P\) is self-stabilizing, and for every
trajectory \(\Phi\) beginning at a legitimate state and containing
a single topology change event of type \(\Lambda\), the passage
predicate holds for every \(\sigma\in\Phi\)\@.
\end{alabsubtext}
%
\apar{3-2}
Although Definition~1 considers trajectories
with a single change, we emphasize that the
intent is to handle trajectories with multiple changes
(each change is completely accommodated before the next change occurs)\@.
Our definition could be modified to state this explicitly,
however, we have
chosen this simpler form to streamline presentations\@.
%
\apar{3-3}
A primary motivation for superstabilization is the notion of
a ``low-impact'' reaction by a protocol to dynamic change\@.
Intuitively, this means that changes necessary in response to
dynamic change should affect relatively few processors and links\@.
To formalize this idea, we introduce an adjustment measure\@.
To define an adjustment, we return to the concept of legitimacy
and a predicate \(\cal P\) that effectively characterizes the legitimacy
predicate \(\cal L\)\@.
Let \(\mathit{var}(\mathcal{P})\) be the minimal collection of variables and fields
upon which \(\cal P\) depends\@. Call \(\cal O\) the state-space ranging
only over the \(\mathit{var}(\mathcal{P})\) data\@. The expression \(\delta[\mathcal{O}]\)
denotes a system state projected onto the \(\cal O\) state-space\@.
Next we consider a function \(\mathcal{F}:\; \mathcal{O}\longrightarrow\mathcal{O}\)\@.
Function \(\cal F\) maps states \(\delta[\mathcal{O}]\) to
states \(\sigma[\mathcal{O}]\) that satisfy
\(\sigma\vdash\mathcal{L}\), where \(\delta\) is any state that can be
obtained from a legitimate state followed by a
\(\Lambda\)-topology change \(\cal E\)\@.
The idea is that \(\cal F\) represents the strategy
of a superstabilizing protocol in reacting to an
event \(\cal E\), namely, choosing
a new legitimate state following dynamic change\@.
We rank a function \(\cal F\) by means of an
adjustment measure \(\cal R\)\@. The adjustment measure
\(\cal R\) is the maximum number of processors having different
\(\cal O\)-states between \(\rho[\mathcal{O}]\)
and \(\mathcal{F}(\rho[\mathcal{O}])\), taken over
all states \(\rho\) derived from some state \(\delta\vdash\mathcal{L}\)
followed by some change event \(\mathcal{E}\in\Lambda\)\@.
A definition of \(\cal F\) with a
small adjustment measure \(\cal R\)
implies that few adjustments are necessary in response to a topology change\@.
%
\apar{3-4}
A superstabilizing protocol \emph{respects} \(\cal F\)
if the protocol implements \(\cal F\), meaning that it
responds to a dynamic change by some computation \(\Theta\) taking the
system from a state \(\rho[\mathcal{O}]\) to a state
\(\mathcal{F}(\rho[\mathcal{O}])\),
and changes \(\cal O\)-states at the minimum number of
processors necessary in order to establish the new legitimate state
given by \(\cal F\)\@. If a protocol respects
\(\cal F\), then we say that the protocol has adjustment measure
\(\cal R\)\@.
%
\apar{3-5}
The notion of adjustment measure can be regarded informally as a distance, in
terms of processor states, between legitimate states of different
configurations\@. It is plausible to consider other definitions of
distance upon which optimal adjustment could be based\@. For example,
if the state change of a particular processor could cause more damage
than changing the states of all other processors, a weighted definition
of adjustment measure would be appropriate\@. The general technique
described in Section~6 can be easily modified to support
alternative definitions of adjustment measure\@.
%
\apar{3-6}
To describe the time complexity of a protocol, the idea of
a \emph{cycle} is introduced\@.
A cycle for a processor \(p\) is a minimal
sequence of steps in a computation so that an iteration
of the protocol at processor \(p\) executes the program for \(p\)
from the first to the last statement\@.
All the programs of this paper are constructed so that a
processor \(p\)'s cycle consists of reading all of \(p\)'s
neighbor registers, performing some local computation, and writing
into \(p\)'s register\@.
The time-complexity of a computation is
measured by \emph{rounds}, defined inductively as follows\@.
Given a computation \(\Phi\), the first round of \(\Phi\) terminates at the
first state at which every processor has completed at least one
cycle; round \(i+1\) terminates after each processor has executed
at least one cycle following the termination of round \(i\)\@.
%
\apar{3-7}
The stabilization time of a protocol is the maximum number
of rounds it takes for the protocol to reach a legitimate state,
starting from an arbitrary state\@. The superstabilization time
is the maximum number of rounds it takes for a protocol starting
from an arbitrary legitimate state \(\sigma\), followed by
an arbitrary \(\Lambda\)-change event \(\cal E\),
to again reach a legitimate state\@.
\asection{4}{Superstabilizing Coloring} \label{coloring}
%
\apar{4-1}
This section exercises the definitions and notation developed
in Sections~2~and~3 for a simple allocation problem\@.
Let \(\mathcal{C}\) be a totally ordered set of colors
satisfying \(|\mathcal{C}| \geq 1+\Delta\), where \(\Delta\)
is an upper bound
on the number of neighbors a processor has in any trajectory\@.
Each processor \(p\) has a register field
\(\mathit{color}_p\)\@. The predicate \(\cal P\) of interest is:
\(\mathit{color}_p\in\mathcal{C}\) for every processor \(p\),
and no two neighboring processors have equal colors\@.
A legitimate state for the coloring protocol is any state
such that (1) predicate \(\cal P\) is satisfied, and (2)
for each computation that starts in such a state,
no processor changes color in the computation\@.
To define the passage predicate, we extend the domain of
a color field to include \(\perp\not\in\mathcal{C}\)\@. The passage
predicate is:
\(\mathit{color}_p\in\mathcal{C}\cup\{\perp\}\) for every processor \(p\)
and, for any neighboring processors \(p\) and \(q\),
\(\mathit{color}_p=\mathit{color}_q\) if and only if \(\mathit{color}_p=\perp\)\@.
%
\begin{figure}[tp]
\framebox[\columnwidth][l]{
\begin{minipage}{\columnwidth}
\smallskip
\begin{tabbing}
x \= xxxx \= xx \= xx \= xx \= xx \= \kill
\> \textbf{Self-Stabilizing Section}: \\
\> \textsf{S1} \> \(A,\;{B} ~:= ~\emptyset,\;\emptyset\) \\
\> \textsf{S2} \> \textbf{forall}~~\(q\in N_p\) \\
\> \textsf{S3} \>\> \textbf{do}\\
\>\>\>\>\(\textsf{read}(q);\)\\
\>\>\>\>\(A ~:= ~A \cup \mathit{color}_q;\)\\
\>\>\>\>\textbf{if}~~\(q>p\)~~\textbf{then}~~\(B\;
:=\; B \cup \mathit{color}_q\)\\
\>\>\>\textbf{od} \\
\> \textsf{S4} \> \textbf{if}~~\((~(\mathit{color}_p=\perp ~\wedge~
\perp\not\in B)~\vee~(\mathit{color}_p\neq\perp ~\wedge~
\mathit{color}_p\in B)~)\) \\
\>\>\textbf{then}~~\(\mathit{color}_p\; :=\;\mathit{choose}
(\mathcal{C}\setminus{A})\) \\
\> \textsf{S5} \> \textsf{write} \\
\> \textbf{Interrupt Section}: \\
\> \textsf{E1} \> \textsf{write}( \>\> \(\textbf{if}~(~\mathcal{E}=\textsf{recov}_p~\vee~(\mathcal{E}=\textsf{recov}_{pq}~\wedge~p>q)~)\)\\
\>\>\>\>\(\textbf{then}~~\mathit{color}_p\; :=\;\perp~\))
\end{tabbing}
\smallskip
%
\end{minipage}
}
\afigcap{1}{Superstabilizing coloring protocol for processor \(p\)} \label{colorprot}
\end{figure}
%
\apar{4-2}
Figure~1 presents a protocol for the coloring problem\@.
The function \(\mathit{choose}(S)\) selects the minimum
color from a set \(S\) (and is undefined if \(S\) is empty)\@.
The protocol of Figure~1 has two parts: one part is
a self-stabilizing protocol, modified to deal with the \(\perp\)
element; the other part lists the interrupt that
handles topology change events\@. Note that although \(\perp\) is
introduced to contend with topology changes, one must then consider
the possibility of an initial state
(for example, due to a transient fault)
in which \(\mathit{color}_p=\perp\) holds for every processor \(p\)\@.
The self-stabilizing section perpetually scans
for a color conflict with the set of neighboring processors having a
larger identifier\@. The interrupt statement writes to the register,
conditionally changing the \(\mathit{color}_p\) field in case the
topology change event is a restart of the processor or a link\@.
%
\begin{alabsubtext}{Theorem}{1}{} \label{col-stab}
The coloring protocol is self-stabilizing and converges
in \(O(n)\) rounds\@.
\end{alabsubtext}
%
\begin{alabsubtext}{Proof of Theorem}{1}{}
\prooffontshape
We show by induction on the number of processors,
that following round \(i\), \(0\leq i\leq n\),
the \(i\) largest-identifier processors have permanent, non-\(\perp\) color
assignments such that no conflict with a neighbor of higher identity
exists among these \(i\) processors\@.
The basis for the induction is trivial, since the empty
set of processors satisfies the assertion\@. Next, suppose the claim
holds following round \(k\)\@. We examine the effect of
round \((k+1)\) with respect to processor \(r\), where
\(r\) is the \((k+1)^{\textup{th}}\) largest processor identifier\@.
In this round, processor \(r\) chooses some
non-\(\perp\) color differing from
any color of a neighboring
processor with larger identity\@. The choice is
deterministic, based on the colors of the larger identities\@. By
hypothesis, these larger identity color assignments are permanent,
so following round \((k+1)\) and for all subsequent rounds, processor
\(r\)'s color is fixed and differs from the colors of all neighbors
of larger identity\@. Thus after
\((n+1)\) rounds, all processors have permanent color assignments\@.
{\nopagebreakbeginpar\markendlst{Proof of Theorem 1}}
\end{alabsubtext}
%
The class of topology events considered for the protocol is \(\Lambda(k)\),
\(0\leq k\leq \Delta\), which includes any
\textsf{crash} event, any link \(\textsf{recov}_{pq}\) event,
and any \(\textsf{recov}_p\) event, subject to the restriction that at most
\(k\) links incident on \(p\) recover at the same instant that \(p\) recovers;
thus, \(\Lambda(0)\) allows only processor or link recovery events that
are not simultaneous, whereas \(\Lambda(\Delta)\) includes the possibility of
a processor and all its links recovering as a single event\@.
%
\apar{4-3}
\begin{alabsubtext}{Theorem}{2}{}
The coloring protocol is superstabilizing
with a superstabilizing time of \(O(k)\) and an adjustment measure
\(\mathcal{R}=(k+1)\), where \(\Lambda(k)\) is the class
of topology change events\@.
\end{alabsubtext}
\begin{alabsubtext}{Proof of Theorem}{2}{}
\prooffontshape
In the case of any \textsf{crash} event,
the protocol remains in a legitimate state\@. In the
case of a \textsf{recov} event, for any new link introduced by the
event, one or both of the incident processors has color \(\perp\)
as a result; thus the passage predicate holds\@. Moreover, at most
\((k+1)\) processors can have color \(\perp\) as a result of the \textsf{recov}
event; by an argument similar to that given in the proof of
Theorem~1, a legitimate state is obtained in \(O(k)\) rounds, and
the passage predicate holds invariantly in the computation; the
adjustment measure \(\mathcal{R}=(k+1)\) follows from the fact that only those
processors incident on the change event adjust color during the
period of superstabilization\@.
{\nopagebreakbeginpar\markendlst{Proof of Theorem 2}}
\end{alabsubtext}
%
\apar{4-4}
The coloring protocol illustrates
qualitative and quantitative aspects of superstabilization\@.
The qualitative aspect is illustrated by the fact that
the convergence following a topology change does not violate
a passage predicate\@. This ensures better service to the
user when no catastrophe takes place
(i.e., in the absence of a severe transient fault)\@.
Quantitative aspects can be seen by the \(O(k)\)
convergence time and adjustment measure\@. The same protocol,
when started in an arbitrary initial state induced by a transient
fault, might take \(O(n)\) rounds to converge, and a processor could
change colors \(O(n)\) times during this convergence\@. Indeed, if
the superstabilizing component (namely, the interrupt statement)
of the protocol is removed, then \(O(n)\) rounds can be
required for convergence following even a single topology change
event starting from a legitimate state\@.
\asection{5}{The Superstabilizing Tree} \label{tree}
%
\apar{5-1}
Constructing a spanning tree in a network is
a basic task for many protocols\@.
Several distributed reset procedures, including self-stabilizing
ones, rely on the construction of a rooted spanning tree to control
synchronization\@. All existing deterministic self-stabilizing algorithms for
constructing spanning trees rely on processor or link identifiers to
select, for example, a shortest-path tree or a breadth-first search
tree\@. In a dynamic network, a change event can invalidate an
existing spanning tree and require that a new tree be computed\@.
Although computation is required when a change event \(\textsf{crash}_{pq}\)
removes one of the links in the current spanning tree, one would hope
that a change event \(\textsf{recov}_{pq}\) would require no
adjustment to an existing spanning tree\@. Most of the self-stabilizing
spanning tree algorithms we know
(e.g., \cite{cj97-04-13,cj97-04-05,cj97-04-08}) construct a BFS (i.e., breadth-first search)
or DFS (i.e., depth-first search) tree and thus require, in some cases, recomputation of
the tree when a link recovers, regardless of whether or not the network
currently has a spanning tree\@. The reason is that a processor
cannot locally ``know'' that the system has stabilized, and must make
a deterministic choice of edges to be included in the tree\@. We propose
a superstabilizing approach to tree construction, which is a variant
of the algorithm proposed in \cite{cj97-04-11}\@. The protocol given
in this section successfully ``ignores'' all dynamic changes that add
links to an existing spanning tree or crash links not contained
in the tree\@.
%
\apar{5-2}
All trajectories considered in this section are free of
\(\textsf{crash}_p\) or \(\textsf{recov}_p\) events; the number
of processors remains fixed at \(n\), and we give every
processor access to the constant \(n\)\@. We also suppose
that the network remains, at all states in a trajectory, connected\@.
%
\apar{5-3}
The basic idea of the protocol is the construction of a least-cost path
tree to a processor \(r\), which is designated as the root of the tree\@.
The key point of the protocol lies in the definition of link costs\@.
Each link is assigned a cost in such a way that links that are part of
the tree have low cost, whereas links outside the tree have high cost\@.
Each processor \(p\) has two register fields, \(t_p\) and \(d_p\)\@.
The field \(t_p\) ranges over identifiers of processors, and represents
the parent of \(p\) in the tree (by convention, we let \(t_r=r\))\@.
The register \(d_p\) contains a non-negative integer representing the
cost of a path from \(p\) to the root \(r\)\@.
%
\begin{figure}[tp]
\framebox[\columnwidth][l]{
\begin{minipage}{\columnwidth}
\smallskip
\begin{tabbing}
x \= xxxx \= xx \= xx \= xx \= xx \= \kill
\> \textbf{Self-Stabilizing Section}: \\
\> \textsf{S1} \> \(x,\; y ~:= ~\infty,\;\perp\) \\
\> \textsf{S2} \> \textbf{forall}~~\(q\in N_p\) \\
\> \textsf{S3} \>\> \textbf{do} \\
\>\>\>\>\(\textsf{read}(q);\)\\
\>\>\>\>\(\textbf{if}~~x>(d_q+w_{pq})~~\textbf{then}~~
x,\;y~:=~(d_q+w_{pq}),\; q\)\\
\>\>\>\textbf{od} \\
\> \textsf{S4} \> \(\textbf{if}~~p=r~~\textbf{then}~~
d_p,\; t_p ~:= ~0, \; r~~\textbf{else}~~
d_p,\; t_p ~:= ~x, \; y\) \\
\> \textsf{S5} \> \textsf{write} \\
\> \textbf{Interrupt Section}: ~~ \textbf{skip}
\end{tabbing}
\vspace*{-4ex}
\[ \text{where} ~~ w_{pq} ~ = ~ \left\{ \begin{array}{cc}
1 & \text{if} ~ t_p=q \\
n & \text{if} ~ t_p\neq q \end{array} \right. \]
\smallskip
%
\end{minipage}
}
\afigcap{2}{Superstabilizing tree protocol for processor \(p\)} \label{treeprot}
\end{figure}
%
\apar{5-4}
Figure~2 shows the code
of the superstabilizing spanning tree protocol\@. The predicate
\(\cal P\) of interest for the tree protocol is that
\((\forall p,q:\; p\neq r \;\wedge\; t_p=q: ~q\in N_p)\),
and that the collection of \(t_p\) variables
\(\{t_p\,|\; p\neq r\,\}\) represents a spanning,
directed tree rooted at \(r\)\@.
A legitimate state for the tree protocol is any state
such that (1) predicate \(\cal P\) is satisfied, and (2)
for each computation that starts in such a state,
no processor changes a \(t_p\) variable in the computation\@.
%
\begin{alabsubtext}{Theorem}{3}{} \label{tree-stab}
The spanning tree protocol self-stabilizes in \(O(n)\) rounds\@.
\end{alabsubtext}
%
(See Appendix A for the proof\@.)\\
%
\apar{5-5}
We define the class of change events \(\Lambda\) for purposes
of superstabilization to be any \(\textsf{recov}_{pq}\) event
or any \(\textsf{crash}_{pq}\) event such that neither
\(t_p=q\) nor \(t_q=p\) holds at the moment of the
\(\textsf{crash}_{pq}\) event; i.e., the \(p\)-\(q\) link is not
a link in the current tree\@.
The passage predicate for the superstabilization property
is identical to \(\mathcal{P}\)\@.
%
\begin{alabsubtext}{Theorem}{4}{} \label{tree-sup}
The spanning tree protocol is superstabilizing for the
class \(\Lambda\), with superstabilization time \(O(1)\) and
adjustment measure \(\mathcal{R}=1\)\@.
\end{alabsubtext}
%
(See Appendix A for the proof\@.)
\asection{6}{General Superstabilization} \label{gensuper}
%
\apar{6-1}
This section introduces a general method for achieving superstabilization
with respect to the class \(\Lambda\) of single topology changes\@.
Our general method can be seen as a compiler that takes
self-stabilizing protocol \(P\) and outputs a new
protocol \(P^s\) that is both self-stabilizing and superstabilizing\@.
This is done by modifying protocol \(P\) and superimposing a new
component, called the \emph{superstabilizer}\@. The superstabilizer
uses, as a tool, a self-stabilizing update protocol\@. The
following section describes our update protocol, after which we
give an overview of the superstabilizer\@.
\asubsection{6.1}{The Update Protocol}
%
\apar{6.1-1}
To simplify the presentation of our general method for superstabilizing
protocols, we employ a self-stabilizing update protocol\@.
We view the update protocol as the simplest and clearest
self-stabilizing protocol for a large class of tasks including
leader election, topology update, and diameter estimation\@.
Our update protocol enjoys a number of properties not directly
required for general superstabilization\@. Some of these properties
are stated and proved in Appendix B, since they may have useful
application to other results on stabilization\@.
We remark that our update protocol works in unidirectional
systems in which the existence of a communication
link from one node to another does not imply the existence
of a link in the opposite direction (e.g., \cite{cj97-04-04,cj97-04-02})\@.
%
\apar{6.1-2}
To describe the task of the update protocol, suppose every
processor \(p\) has some field image \(x_p\); for the moment,
we consider \(x_p\) to be a constant\@. The \emph{update} problem
is to broadcast each \(x_p\) to all processors\@. This problem is
called the \emph{topology update} when the field \(x_p\)
contains all the local information about
\(p\)'s links and network characteristics\@.
Many dynamic systems are already equipped with a topology update protocol
that notifies processors of the current topology; in such instances,
our general method acts as an extension to this existing topology update\@.
An optimal time (\(\Theta(d)\)
round) self-stabilizing solution to the topology update is given in
\cite{cj97-04-17,cj97-04-15}\@. To ensure a desired deterministic property of the
protocol, we assume that the neighborhood of a processor \(N_p\)
is represented as an ordered list\@.
%
\apar{6.1-3}
Let each processor \(p\) have, in addition to \(x_p\),
a field \(e_p\), where \(e_p\) contains
three tuples of the form \(\langle q, u, k \rangle\), in which
\(q\) is a processor identifier, \(u\) is of the same type as \(x_p\),
and \(k\) is a non-negative integer\@.
Let \(\mathit{dist}_\mathcal{T}(p,q)\) be the minimum
number of links contained in a path between
processors \(p\) and \(q\) in topology \(\cal T\)
(in arguments where the topology is understood from the context,
we write \(\mathit{dist}(p,q)\));
the third component of the tuple is intended to
represent the \emph{dist} value between \(p\) and the processor
named in the tuple's first component\@.
We make some notational conventions in dealing with tuples: with respect to
a given (global) state, \(\langle q,x_q,k\rangle\) is a tuple whose
second component contains the current value of field \(x_q\)\@. In proofs
and assertions, we specify tuples partially: \(\langle q,,\rangle\in e_p\)
is the assertion that processor \(p\)'s \(e\) field contains a tuple
with \(q\) as its first component\@.
Each processor uses local variables \(A\) and \(B\) that
range over the same set of
tuples that \(e_p\) does\@. For field image \(e_p\)
and set variables \(A\) and \(B\), we assume that set
operations are implemented
so that computations on these objects
are deterministic\@.
%
\begin{figure}
\framebox[\columnwidth][l]{
\begin{minipage}{\columnwidth}
\smallskip
\begin{tabbing}
\textsf{C99} \= xxxxx \= xx \= xx \= xx \= \kill
\> \textsf{C1} \> \(A,B := \emptyset,\emptyset\) \\
\> \textsf{C2} \> \(\textbf{forall}~~q\in N_p~~
\textbf{do}~~\textsf{read}(q);~~
A := A \cup e_q~~\textbf{od}\) \\
\> \textsf{C3} \> \(A := A\tupsub\langle p,\ast,\ast\rangle; ~~
A := A{\mbox ++}\langle\ast,\ast,1\rangle\) \\
\> \textsf{C4} \> \(\textbf{forall}~~q\in\textsf{processors}(A)~~
\textbf{do}~~B := B\cup\{ \textsf{mindist}(q,A)\}~~
\textbf{od}\) \\
\> \textsf{C5} \> \(B := B\cup\{\langle p,x_p,0\rangle\};~~
e_p := \textsf{initseq}(B)\) \\
\> \textsf{C6} \> \textsf{write}
\end{tabbing}
\smallskip
%
\end{minipage}
}
\afigcap{3}{Update protocol for processor \(p\)} \label{topo-a}
\end{figure}
%
\apar{6.1-4}
The update protocol's code uses the following definitions\@.
Let \(\textsf{processors}(A)\) be the list of
processor identifiers obtained from the
first components of the tuples in \(A\)\@.
Let \(\textsf{mindist}(q,A)\) be the first tuple
in \(A\) having a minimal third component
of any tuple whose first component is \(q\) (in case
no matching tuple exists, then \textsf{mindist}
is undefined)\@.
Define \(A\tupsub\langle q,\ast,\ast\rangle\)
to be the list of tuples obtained from \(A\) by removing every
tuple whose first component is \(q\)\@.
Define \(A{\mbox ++}\langle\ast,\ast,1\rangle\)
to be the list of tuples obtained from \(A\) by incrementing
the third component of every tuple in \(A\)\@.
Define \(\textsf{initseq}(A)\) by the following
procedure: first sort the tuples of \(A\) in ascending
order of the third element of a tuple;
then, from this ordered sequence of
tuples, compute the maximum initial prefix of
tuples with the property: if
\(\langle q,u,k\rangle\) and
\(\langle q',u',k'\rangle\) are successive
tuples in the prefix, then \(k'\leq k+1\)\@.
Then \(\textsf{initseq}(A)\) is the set of tuples
in this initial prefix\@.
%
\apar{6.1-5}
For the update protocol, we define a \emph{distance-stable}
state to be any state for which:
\textsf{(1)} each processor \(p\) has exactly one tuple
\(\langle q,y,dist(p,q)\rangle\) in its \(e_p\)
field for every processor \(q\) in the network reachable
by some path from \(p\) in the current topology; \textsf{(2)} \(e_p\)
contains no other tuples; and \textsf{(3)} each computation
that starts in such a state preserves \textsf{(1)} and \textsf{(2)}\@.
A \emph{legitimate} state for the update protocol is
a distance-stable state in which requirement \textsf{(1)} is
strengthened to: each processor \(p\) has exactly one tuple
\(\langle q,x_q,dist(p,q)\rangle\) in its \(e_p\)
field for every processor \(q\)---in other words, the
\(x\) field images are accurate\@.
Figure~3 presents the protocol\@.
%
\begin{alabsubtext}{Theorem}{5}{} \label{update-theorem}
The update protocol of Figure~3
self-stabilizes in \(O(d)\) rounds\@.
\end{alabsubtext}
%
(See Appendix B for proof\@.)\\
%
\apar{6.1-6}
A corollary of self-stabilization is that, if one of the
\(x_p\) fields is dynamically changed, the protocol will
effectively broadcast the new \(x_p\) value to other processors\@.
Of particular interest are some properties that relate
a sequence of changes to an \(x_p\) field to the sequence
of \(x_p\) values observed at another processor \(q\)\@. More
specifically, if processor \(p\) writes, over the course of a
computation, the values
\(c_1,c_2,\dots\) into \(x_p\), and no processor \(q\) reads
(via update images of \(x_p\)) a value \(c_k\) and then later reads a value
\(c_j\) for \(j<k\)\@.
% NOTE(review): the source appears truncated here -- the remainder of
% this paragraph and the material introducing the superstabilizer
% (through the opening of Figure 4) seem to be missing; restore from
% the published article.
%
\begin{figure}[tp]
\framebox[\columnwidth][l]{
\begin{minipage}{\columnwidth}
\smallskip
\begin{tabbing}
x \= xxxx \= xx \= xx \= xx \= xx \= \kill
\> (statements \textsf{C1--C3} of update protocol) \\
\> \textsf{U1} \> \textbf{if}~~\((a_p=1\;\wedge\;
(\exists q:: \; a_q\neq 0 \;\wedge\; q>p \;\wedge\;
\; (\forall r:: \; h_r[q]\neq 0)))\) \\
\>\>\> \textbf{then}~~\(a_p\;:=\;0\) \\
\> \textsf{U2} \> \textbf{if}~~\((a_p=1\;\wedge\;
(\forall q: \; q\neq p: \; a_q=0) \;\wedge\;
(\forall q:: \; h_q[p]\neq 0))\) \\
\>\>\> \textbf{then}~~\(a_p,\;u_p \;:=\; 2,\; \mathcal{F}(s_p)\) \\
\> \textsf{U3} \> \textbf{if}~~\((a_p=2\;\wedge\;
(\forall q:: \; h_q[p]\neq 1))\) \\
\>\>\> \textbf{then}~~\(a_p\;:=\; 0\) \\
\> \textsf{U4} \> \textbf{forall}~~\(q\in\textsf{processors}(A)\cup\{p\}\) \\
\>\>\> \textbf{do} \\
\> \textsf{U5} \>\> \textbf{if}~~\(a_q=0\)~~\textbf{then}~~
\(h_p[q] ~:= ~0\) \\
\> \textsf{U6} \>\> \textbf{if}~~\(a_q=1\;\wedge\;h_p[q]=0\)~~\textbf{then}~~
\(h_p[q] ~:= ~1\) \\
\> \textsf{U7} \>\> \textbf{if}~~\(a_q=2\;\wedge\;h_p[q]=1\)~~\textbf{then}~~
\(h_p[q],\;\textsf{snap}_p ~:= ~2,\; u_q[p]\) \\
\>\>\> \textbf{od} \\
\> \textsf{U8} \> \textbf{if}~~\((\exists q\in\textsf{processors}(A)\cup\{p\}::
\: a_q\neq 0)\) \\
\>\>\> \textbf{then}~~\(\mathit{freeze}_p\; ,\; t_p ~:=~
\mathit{true}, \; \textsf{snap}_p\) \\
\>\>\> \textbf{else}~~\(\mathit{freeze}_p~:= ~\mathit{false}\) \\
\> (statements \textsf{C4--C6} of update protocol) \\
\textbf{Interrupt Section:} \\
\> \textsf{E1} \> \textsf{write} \\
\> \>\> \(\left(
\begin{array}{lll}
a_p & := & 1 \\
\mathit{freeze}_p & := & \mathit{true} \\
t_p & := & \textsf{snap}_p \\
e_p & := & \left\{
\begin{array}{lll}
\emptyset & \text{if} & \mathcal{E}=\textsf{recov}_p \\
\textsf{refresh}(e_p) & \text{if} & \mathcal{E}\neq\textsf{recov}_p
\end{array}\right.
\end{array}\right)\)
\end{tabbing}
\smallskip
%
\end{minipage}
}
\afigcap{4}{Superstabilizer: update extension and interrupt for \(p\)} \label{stabilizer}
\end{figure}
\asection{7}{Conclusions} \label{conclusions}
%
\apar{7-1}
There is increasing recognition that dynamic protocols are necessary
for many networks\@. Studying different approaches to programming for
dynamic environments is therefore a motivated research topic\@. Although
self-stabilizing techniques for dynamic systems have been previously
suggested, explicit research to show how and where these techniques are
useful has been lacking\@. This paper shows how assumptions about interrupts
and dynamic change can be exploited with qualitative and quantitative
advantages, while retaining the fault-tolerant properties of self-stabilization\@.
%
\apar{7-2}
In particular, we suggest that when the system is in an illegitimate
configuration, reset to a predefined configuration will not take place;
instead, the system will reach a legitimate configuration that is
close to the current illegitimate configuration (where ``close'' means a
small adjustment measure)\@. The benefits of this approach are twofold\@.
First, such a strategy may keep most of the sites of the system unchanged
and in working order (as in the example of connections within
an unchanged portion of a spanning tree)\@. Second, in some cases the
amount of work (superstabilizing time) required to reach a close
legitimate state can be small (as in our coloring example)\@.
%
\asection{Acknowledgment}{}
We thank Ajoy Kumar Datta and an anonymous referee
for helpful remarks\@.
\appendix
%
\asection{Appendix A:}{Spanning Tree Proofs}
%
%\apar{A-1}
%
\begin{alabsubtext}{Theorem}{3}{}
The spanning tree protocol self-stabilizes in \(O(n)\) rounds\@.
\end{alabsubtext}
%
\begin{alabsubtext}{Proof of Theorem}{3}{}
\prooffontshape
Proof by induction on an arbitrary computation \(\Phi\)\@.
The induction is based on a directed tree\@.
Let \(T_r\) be the maximum subset of processors whereby:
\begin{enumerate}
\item \(d_r=0\),
\item the set \(\{t_p\,|\;p\in T_r\;\wedge\; p\neq r\,\}\) represents a
directed tree rooted at \(r\),
\item for \(p\in T_r\) and \(p\neq r\),
register field \(d_p\) satisfies \(d_p=1+d_q\) where \(q=t_p\), and
\item each processor in \(T_r\) has executed at least one cycle in \(\Phi\)\@.
\end{enumerate}
After one round, \(d_r=0\) holds for the remainder of the computation\@.
Therefore, after the first round, \(T_r\) is nonempty,
containing at least \(r\)\@.
The remainder of the proof concerns rounds two
and higher, and is organized into the following three claims\@.
%
\begin{quote}
\begin{alabsubtext}{Claim}{1}{\mathversion{bold}\(T_r\) is Stable}
If \(p\in T_r\)
holds at the beginning of the round, then \(t_p\) and
\(d_p\) do not change during the round\@. The claim
follows by induction on depth of the tree \(T_r\)\@.
First we strengthen the hypothesis to state that
any node \(p\in T_r\) satisfies \(d_r=k\), where \(k\) is
the depth of \(p\) in \(T_r\)\@.
The basis for the induction is the root \(r\), which
satisfies \(d_r=0\); statement \textsf{S4} assures that
\(d_r\) does not change during the round\@.
Suppose all nodes of \(T_r\) up to depth
\(k\) satisfy the hypothesis, i.e., they are stable
and have a \(d\) field equal to the node depth\@.
Consider some \(v\in T_r\) at depth \(k+1\); by \textsf{(2)} and \textsf{(3)},
\(d_v=k+1\) at the beginning of the round\@. Since every \(d\) field has
non-negative value and \(w_{vz}=n\) for any \(z\neq t_v\), and assuming
the inductive hypothesis for the node named by \(t_v\), statement
\textsf{S3} cannot compute any lower value for \(x\) than \(k+1\), and
the values \(t_v\) and \(d_v\) do not change during the round\@.
\end{alabsubtext}
%
\begin{alabsubtext}{Claim}{2}{\mathversion{bold}\(T_r\) Growth}
If there exists a processor that is not contained in \(T_r\) and
\((\forall p:\; p\not\in T_r: \; d_p>2n)\) holds
at the beginning of the round, then \(T_r\) grows by
at least one processor by the end of the round\@.
The claim follows by examining processors outside
of \(T_r\) that also neighbor \(T_r\)\@. Let \(p\) be
such a processor, outside \(T_r\) and neighbor to \(q\in T_r\)\@.
By Claim 1, \(d_q+w_{pq}<2n\)\@. Therefore, during the round,
\(p\) cannot choose \(t_p\) to be some processor \(s\) satisfying
\(d_s>2n\)\@. Thus \(T_r\) grows by at least one processor\@.
\end{alabsubtext}
%
\begin{alabsubtext}{Claim}{3}{\mathversion{bold}\(d_p\) Growth}
Define \(M_i\) to be the minimum \(d\)-register value of
any processor outside of \(T_r\) in round \(i\); then
\(M_{i+1}>M_i\)\@. The claim is verified by considering,
for round \(i\) and \(p\not\in T_r\),
assignment to each \(d_p\) register
in that round\@. During a round, the value
obtained for \(d_p\) is strictly larger
than that of some neighboring \(d_q\); if
\(q\in T_r\), then \(p\in T_r\) holds at the
end of the round; and if \(q\not\in T_r\),
then the claim holds\@.
\end{alabsubtext}
\end{quote}
%
A corollary of Claim 3 is that following
rounds \(2n+2\) and higher, for every \(p\not\in T_r\),
the field \(d_p\) satisfies \(d_p>2n\)\@. Consequently,
for rounds \(2n+2\) and higher, by Claim 2, if \(T_r\) does not
contain all processors, then \(T_r\) grows by at
least one processor in each successive round\@.
The theorem follows because there are at most \(n\)
processors\@.
{\nopagebreakbeginpar\markendlst{Proof of Theorem 3}}
\end{alabsubtext}
%
\begin{alabsubtext}{Theorem}{4}{}
The spanning tree protocol is superstabilizing for the
class \(\Lambda\), with superstabilization time \(O(1)\) and
adjustment measure \(\mathcal{R}=1\)\@.
\end{alabsubtext}
%
\begin{alabsubtext}{Proof of Theorem}{4}{}
\prooffontshape
We show that starting from a state \(\delta\), \(\delta\vdash\mathcal{L}\),
followed by a topology change \(\cal E\), \(\mathcal{E}\in\Lambda\),
resulting in a state \(\sigma\), that \(\sigma\vdash\mathcal{L}\) holds\@.
In the case of \(\mathcal{E}=\textsf{crash}_{pq}\), removing a nontree link,
for either processor \(p\) or \(q\), the weight of the \(p\)-\(q\) link
\(w_{pq}=n\) at state \(\delta\)\@. By assumption of \(\delta\vdash\mathcal{L}\),
it follows that computation of \(d\) and \(t\) fields produces identical
results in any round following \(\sigma\), since these are necessarily
based on unit \(w\) values\@. In the case of \(\mathcal{E}=\textsf{recov}_{pq}\),
the weight of the new \(p\)-\(q\) link is \(w_{pq}=n\) at state \(\sigma\);
hence distances are not reduced by addition of the new link, and
computation of \(d\) and \(t\) fields produces identical
results in any round following \(\sigma\)\@. Therefore \(\sigma\vdash\mathcal{L}\)\@.
{\nopagebreakbeginpar\markendlst{Proof of Theorem 4}}
\end{alabsubtext}
\asection{Appendix B:}{Update Protocol}
%
\apar{B-1}
In addition to showing that the update protocol presented in
Section~6 is self-stabilizing, additional theorems
given in this appendix show that the update protocol enjoys
some other properties related to broadcast monotonicity
and the protocol's use of memory\@. We begin with the
proof of self-stabilization\@.
\asubsection{B.1}{Self-Stabilization}
%
\apar{B.1-1}
%
\begin{alabsubtext}{Theorem}{5}{} \label{update-theorem}
The update protocol of Figure~3
self-stabilizes in \(O(d)\) rounds\@.
\end{alabsubtext}
%
\begin{alabsubtext}{Proof of Theorem}{5}{}
\prooffontshape
The proof is organized as three claims\@.
%
\begin{quote}
\begin{alabsubtext}{Claim}{4}{}
Following round \(i\), \(i\geq 1\), the \(e_p\) field of
every processor satisfies
\[ (\forall p,q,j:\;\; j<i: ~~ \langle q,,j\rangle\in e_p
~\Rightarrow~ \mathit{dist}(p,q)=j\,) \]
% NOTE(review): the source appears truncated here -- the proof of
% Claim 4, the statement and proof of Claim 5, and the statement of
% Claim 6 seem to be missing; the claims below are reconstructed from
% their later uses (L1346, L1353--L1356) and should be checked against
% the published article.
\end{alabsubtext}
%
\begin{alabsubtext}{Claim}{5}{}
For every processor \(p\), a tuple \(\langle q,,k\rangle\in e_p\)
having minimal third component for \(q\) satisfies
\(k\geq\mathit{dist}(p,q)\)\@.
\end{alabsubtext}
%
\begin{alabsubtext}{Claim}{6}{}
Following round \(d+1\), no field \(e_p\) contains a tuple
\(\langle q,,j\rangle\) with \(j>d\)\@. Observe that if \(j>d+1\), then by the
construction of the \emph{initseq} function, at the
end of round \(d+1\) the field \(e_p\) also contains
some tuple \(\langle q,,k\rangle\) where \(k=d+1\)\@.
Thus to show the claim, it suffices to show a contradiction
for \(k=d+1\)\@. Since \(p\) assigned the tuple
\(\langle q,,d+1\rangle\) to \(e_p\) during round \(d+1\),
it must be that \(p\) found at some neighbor \(s\) the
tuple \(\langle q,,d\rangle\), and found no tuple
with \(q\) as the first component at a smaller distance\@.
However, the tuple located at \(s\) having distance
\(d\) represents the shortest distance to \(q\) by
Claim 5\@. And since \(d\) bounds the maximum possible
shortest path, by Claim 4 all shortest paths between
\(p\) and \(q\) are visible to \(p\) at the end of round
\(d\)\@. We conclude that \(dist(p,q)=d+1\), which contradicts
the definition of diameter \(d\)\@.
\end{alabsubtext}
\end{quote}
\sentence
%
Claims 4--6 together imply that, following \(d+1\) rounds,
each processor correctly has a tuple for every other processor
at distance \(d\), and every tuple in an \(e\) field
correctly refers to a processor\@.
%
{\nopagebreakbeginpar\markendlst{Proof of Theorem 5}}
\end{alabsubtext}
\asubsection{B.2}{Memory Adaptivity}
%
\apar{B.2-1}
Nowhere in the code of the update protocol is the size of the network used,
nor is a bound on the number of processors in a connected component assumed;
consequently, any number of processors can be dynamically added to the system,
provided processor identifiers are unique\@. Moreover, the local implementation
of operations on processor variables \(A\), \(B\), and even the field \(e_p\) can
use dynamic memory allocation\@. The following lemma shows that dynamic
memory operations do not use unbounded amounts of memory\@.
\begin{alabsubtext}{Lemma}{1}{}
For any computation \(\Phi\) of the update protocol, no processor requires
more than \(O(\Delta\cdot K\cdot n)\) space for variables and register fields,
where in the initial state of \(\Phi\):
\(\Delta\) is the maximum number of neighbors a processor has, \(n\)
is the number of processors, and
\(K\) is the maximum number of tuples of any processor's \(A\), \(B\), or
\(e\) field\@.
\end{alabsubtext}
%
\begin{alabsubtext}{Proof of Lemma}{1}{}
\prooffontshape
The computation of \(B\)
consists of at most \(nK\) tuples, since tuples with duplicate
identifiers are not added to \(B\) by \textsf{C4}, and the number of
identifiers is bounded by \(nK\)\@. Moreover, no statement is
capable of introducing a tuple with a processor identifier not
already present in another tuple\@. Hence any assignment by
\textsf{C5} places at most \(nK\) tuples in \(e_p\)\@.
The collection procedure for constructing \(A\) is the union of at most
\(\Delta\) sets of at most \(nK\) tuples (\(K\) tuples in the first round,
and \(nK\) tuples in subsequent rounds)\@.
{\nopagebreakbeginpar\markendlst{Proof of Lemma 1}}
\end{alabsubtext}
%
Although the lemma shows that the update protocol does not use
unbounded space in its computation, this is insufficient for a
self-stabilizing implementation: suppose processors are implemented
on machines with fixed memory limits and an initial state of a
computation is such that the number of tuples is at or near the
memory limit; subsequent computation may then abort by exceeding
the memory limit in a dynamic allocation request\@.
Therefore, in order to claim that the update protocol
is self-stabilizing, we assume that every trajectory's initial
state satisfies \(nK\leq\mathcal{N}\), where \(\mathcal{N}\) is some appropriate
limit related to memory limits of processors (even if the abort resets
memory, some minimal amount of memory is needed to
guarantee self-stabilization of the update protocol)\@.
%
\apar{B.2-2}
Note that upon stabilization, the \(e_p\) register
contains only those tuples representing reachable nodes in the network\@.
Therefore, the amount of memory needed for \(e_p\) can be dynamically
adjusted during a computation to the minimum amount needed to
represent the list of tuples; this
idea is called \emph{memory adaptivity} in \cite{cj97-04-03}\@.
The following lemma is an observation due to
Gerard Tel\@.\footnote{Remark during presentation, December 1993\@.}
%
\begin{alabsubtext}{Lemma}{2}{}
The update protocol of Figure~3
is memory adaptive\@.
\end{alabsubtext}
\begin{alabsubtext}{Proof of Lemma}{2}{}
\prooffontshape
Upon stabilization, the necessary size of the \(e\) field
is bounded by a function of the number of processors\@.
{\nopagebreakbeginpar\markendlst{Proof of Lemma 2}}
\end{alabsubtext}
\asubsection{B.3}{Monotonicity Properties}
%
\apar{B.3-1}
We distinguish three \emph{monotonicity} properties of
an update protocol:
%
\begin{itemize}
\item \emph{Static monotonicity}\@.
Let \(\sigma\) be a legitimate state for the update
protocol where \(x_p=c_0\) at \(\sigma\)\@.
Suppose \(\Phi\) is a topology-constant computation
originating with state \(\sigma\), and suppose field \(x_p\) is
changed at distinct states \(\delta_1,\delta_2,\dots\)
of \(\Phi\) to have the values \(c_1,c_2,\dots\), where state
\(\delta_i\) occurs before \(\delta_j\) for \(i