%MWM 18 July 1999
\documentstyle[12pt]{article}

%% Whoa!!!

\lefthyphenmin=2
\righthyphenmin=3


%au query
\newcounter{querno}
\setcounter{querno}{1}
\newcommand{\auquery}[1]{\marginpar{\raggedright\scriptsize 
\thequerno. au: #1}\addtocounter{querno}{1}}
\newcommand{\query}[1]{\marginpar{\raggedright\scriptsize #1}}

%operators and MM macros
\def\operator#1{\mathop{\it#1}\nolimits}
\newcommand{\peek}{{\operator{peek}}}
\newcommand{\arglist}{{\operator{arglist}}}
\newcommand{\call}{{\operator{call}}}
\newcommand{\readit}{{\operator{read}}}
\newcommand{\writeit}{{\operator{write}}}
\newcommand{\resp}{{\operator{resp}}}
\newcommand{\retlist}{{\operator{retlist}}}
\newcommand{\ret}{{\operator{ret}}}
\newcommand{\ack}{{\operator{ack}}}
\newcommand{\OP}{{\operator{OP}}}
\newcommand{\op}{{\operator{op}}}
\newcommand{\ops}{{\operator{ops}}}
\newcommand{\WRITE}{{\operator{WRITE}}}
\newcommand{\READ}{{\operator{READ}}}
\newcommand{\RSUC}{{\operator{RSUC}}}
\newcommand{\RCUS}{{\operator{RCUS}}}
\newcommand{\rsuc}{{\operator{rsuc}}}
\newcommand{\rcus}{{\operator{rcus}}}
\newcommand{\RCUL}{{\operator{RCUL}}}
\newcommand{\RLUS}{{\operator{RLUS}}}
\newcommand{\rcul}{{\operator{rcul}}}
\newcommand{\rlus}{{\operator{rlus}}}
\newcommand{\uc}{{\operator{uc}}}
\newcommand{\UC}{{\operator{UC}}}
\newcommand{\rs}{{\operator{rs}}}
\newcommand{\RS}{{\operator{RS}}}
\newcommand{\Maxdom}{{\operator{Maxdom}}}
\newcommand{\NC}{{\operator{NC}}}
\newcommand{\CG}{{\operator{CG}}}
\newcommand{\RCG}{{\operator{RCG}}}
\newcommand{\VC}{{\operator{VC}}}
\newcommand{\MOP}{{\operator{MOP}}}
\newcommand{\mop}{{\operator{mop}}}
\newcommand{\AOP}{{\operator{AOP}}}
\newcommand{\aop}{{\operator{aop}}}
\newcommand{\FIND}{{\operator{FIND}}}
\newcommand{\INS}{{\operator{INS}}}
\newcommand{\UP}{{\operator{UP}}}
\newcommand{\DoMop}{{\operator{DoMop}}}
\newcommand{\DoOp}{{\operator{DoOp}}}
\newcommand{\INC}{{\operator{INC}}}
\newcommand{\HALF}{{\operator{HALF}}}
\newcommand{\FOP}{{\operator{FOP}}}
\newcommand{\fop}{{\operator{fop}}}
\newcommand{\SELFOP}{{\operator{SELFOP}}}
\newcommand{\selfop}{{\operator{selfop}}}
\newcommand{\scratch}{{\operator{scratch}}}
\newcommand{\shift}{{\operator{shift}}}
\newcommand{\WOP}{{\operator{WOP}}}
\newcommand{\Wop}{{\operator{Wop}}}
\newcommand{\SOP}{{\operator{SOP}}}
\newcommand{\Sop}{{\operator{Sop}}}
\newcommand{\WADD}{{\operator{WADD}}}
\newcommand{\WMUL}{{\operator{WMUL}}}
\newcommand{\Wadd}{{\operator{Wadd}}}
\newcommand{\Wmul}{{\operator{Wmul}}}
\newcommand{\wMUL}{{\operator{WMUL}}}
\newcommand{\WREAD}{{\operator{WREAD}}}
\newcommand{\Wread}{{\operator{Wread}}}
\newcommand{\WAOP}{{\operator{WAOP}}}
\newcommand{\Waop}{{\operator{Waop}}}
%\newcommand{\}{{\operator{}}}

%\setlength{\textheight}{9.0in}
%\setlength{\topmargin}{0.25in}

%\makeatletter

%\makeatother

\setcounter{footnote}{1}

\newtheorem{lemma}{Lemma}
\newtheorem{claim}[lemma]{Claim}
\newtheorem{theorem}[lemma]{Theorem}
\newtheorem{corollary}[lemma]{Corollary}
\newtheorem{formalrule}[lemma]{Rule}

\newcommand{\qedsymb}{\mbox{ }~\hfill~{\rule{2mm}{2mm}}}
\newenvironment{proof}{\begin{trivlist}
\item[\hspace{\labelsep}{\bf\noindent Proof\,}]
}{\qedsymb\end{trivlist}}
\newenvironment{proofof}[1]{\begin{trivlist}
\item[\hspace{\labelsep}{\bf\noindent Proof of #1\,}]
}{\qedsymb\end{trivlist}}


\title{%\setlength{\baselineskip}{1pc}
%\bf 
Time Bounds for Strong and Hybrid Consistency for Arbitrary Abstract
Data Types\\}

\author{
{Martha J. Kosa} \\
%{Department of Computer Science}\\
{(Tennessee Technological University)}\\
\texttt{http://www.csc.tntech.edu/\char`\~mjkosa/}\\
mjk9504@tntech.edu
%{Box 5101}\\
%{Cookeville, TN  38505}\\
%{mjkosa@tntech.edu}\\%email addr correct --MM
}

\date{2 September 1999}

\begin{document}

%%
%% Generic Titlepage, Insert after \begin{document}
%%
\def\registered{\({}^{\ooalign%
    {\hfil\raise.07ex\hbox{{\tiny\textup{R}}}%
    \hfil\crcr\scriptsize\mathhexbox20D\cr}}\)}
\begin{titlepage}
\pagenumbering{roman}
\null\vfil\vskip 60pt
\begin{center}
{\Huge\bf Chicago\nolinebreak[1] Journal of Theoretical Computer\nolinebreak[1] Science\par}
\vskip 3em
{\huge\it The MIT Press\par}
 \end{center}
\par\vskip 1.5em
{\Large
  \begin{center}
    Volume 1999, Article 9\\
    \textit{Time Bounds for Strong and Hybrid Consistency for Arbitrary Abstract Data Types}
  \end{center}
}
\par\noindent
ISSN 1073--0486. MIT Press Journals, Five Cambridge Center, Cambridge, MA
02142-1493 USA; (617)253-2889;
\emph{journals-orders@mit.edu, journals-info@mit.edu}.
Published one article at a time in \LaTeX\ source form on the
Internet. Pagination varies from copy to copy. For more
information and other articles see:
\begin{itemize}
  \item \emph{http://mitpress.mit.edu/CJTCS/}
  \item \emph{http://www.cs.uchicago.edu/publications/cjtcs/}
  \item \emph{ftp://mitpress.mit.edu/pub/CJTCS}
  \item \emph{ftp://cs.uchicago.edu/pub/publications/cjtcs}
\end{itemize}

\par\noindent
The \emph{Chicago Journal of Theoretical Computer Science} is
abstracted or indexed in \emph{
Research Alert,\registered
SciSearch, \registered
Current Contents\registered/Engineering
Computing \& Technology}, and \emph{CompuMath Citation Index.\registered}

\par\noindent\copyright 1999 The Massachusetts Institute of Technology.
Subscribers are licensed to use journal articles in a variety of
ways, limited only as required to ensure fair attribution to authors
and the journal, and to prohibit use in a competing commercial
product. See the journal's World Wide Web site for further
details. Address inquiries to the Subsidiary Rights Manager, MIT
Press Journals; (617)253-2864;
\emph{journals-rights@mit.edu}.\pagebreak[2]

\par\noindent The \emph{Chicago Journal of Theoretical Computer Science}
is a peer-reviewed scholarly journal in theoretical computer
science. The journal is committed to providing a forum for
significant results on theoretical aspects of all topics in computer
science.
\par
\begin{trivlist}
  \item[\emph{Editor-in-Chief:}] Janos Simon
  \item[\emph{Consulting Editors:}]
    Joseph Halpern, Eric Allender, Raimund Seidel
  \item[\emph{Editors:}]
    \begin{tabular}[t]{lll}
	Martin Abadi & Greg Frederickson & John Mitchell \\
	Pankaj Agarwal & Andrew Goldberg & Ketan Mulmuley \\
	Georg Gottlob & Gil Neiger & Tetsuo Asano  \\
	Vassos Hadzilacos & David Peleg & Laszl\'o Babai \\
	Juris Hartmanis & Andrew Pitts & Eric Bach \\
	Maurice Herlihy & James Royer & Stephen Brookes \\
	Ted Herman & Alan Selman & Jin-Yi Cai \\
	Stephen Homer & Nir Shavit & Anne Condon \\
	Neil Immerman & Eva Tardos & Cynthia Dwork \\
	Howard Karloff & Sam Toueg & David Eppstein \\
	Philip Klein & Moshe Vardi & Ronald Fagin \\
	Phokion Kolaitis & Stuart Kurtz & Jennifer Welch \\
	Lance Fortnow & Stephen Mahaney & Pierre Wolper \\
	Steven Fortune & Michael Merritt
    \end{tabular}
  \vspace{1ex}
  \item[\emph{Managing Editor:}] Michael J.\ O'Donnell
  \vspace{1ex}
  \item[\emph{Electronic Mail:}] \emph{chicago-journal@cs.uchicago.edu}
\end{trivlist}
\vfil\null
\end{titlepage}
\maketitle
\pagenumbering{arabic}
%% 
%% End Titlepage code
%%



\thispagestyle{empty}

\subsection*{\centering Abstract}
We compare the costs of three well-known consistency guarantees (sequential
consistency, linearizability, and hybrid consistency) for shared memory objects
of {\it arbitrary\/} data types, generalizing previous work on specific
types.  We
evaluate the impact of the desired consistency guarantee, the amount
of synchrony among the users of the objects, and algebraic properties
of a type on the worst-case time complexity of
operations of the type.  These algebraic properties are sufficient for proving
many lower bounds and some upper bounds (even 0,
meaning only local computation is required) on the worst-case times.  Under
certain reasonable assumptions, sequential consistency and linearizability are
equally costly in perfectly synchronized systems, linearizable operations are
more expensive in approximately synchronized systems than in perfectly
synchronized systems, sequentially consistent operations are cheaper than
linearizable operations in approximately synchronized systems, and hybrid
consistency is not necessarily cheaper than sequential consistency or
linearizability.
\section{Introduction}
\subsection{Background material}
In concurrent systems, processes 
need to share information with each other.
Because of software engineering concerns,
using shared data objects 
from arbitrary abstract data types is a popular
method for organizing this information.
However, processes may not have access to a
physical shared memory, because they may be running on computers
separated by long distances or their computer architecture may not
provide a physical shared 
memory.  They must simulate a physical shared
memory with a virtual shared memory.

A consistency 
guarantee tells the processes 
using the virtual shared objects what they can
expect about the values returned 
as the result of applying operations, even when
operations are executed concurrently 
on the same virtual object.  Researchers
have defined many types of consistency 
guarantees---some strong, some
weak, and some in between.  A 
strong guarantee is at least as expensive to
implement as a weaker guarantee, 
but it may be impossible (or very hard) to
solve a problem by using virtual shared objects providing a weaker
guarantee (see~\cite{cj99-09-04}).

Sequential consistency and linearizability are two strong
consistency guarantees.  They ensure 
that operations appear to have executed
atomically in some sequential order 
that reflects the order in which operations
were executed at each process.  In addition, 
linearizability ensures that
this sequential order preserves the relative ordering of all
nonoverlapping operations, even 
those executed by different processes.
Sequential consistency is a very 
popular consistency guarantee, used for
virtual shared memories and multiprocessor 
caches (see~\cite{cj99-09-02,cj99-09-05}).  However,
linearizability is a
stronger, more intuitive guarantee, 
because sequential consistency provides
no clues about the relative ordering of 
nonoverlapping operations performed
by different processes.  Although linearizability is more
powerful than sequential 
consistency, it is still interesting to study both
guarantees;  the difference between their definitions is very slight.

Attiya and Friedman~\cite{cj99-09-04} investigated 
practical shared memory consistency
models (refer to~\cite{cj99-09-01,cj99-09-07,cj99-09-09,cj99-09-10}) 
and formally
defined hybrid consistency, a generalization of those models; 
it systematically
weakens 
either sequential consistency or linearizability.
In hybrid
consistency, there are strong and weak operations.
Strong operations appear to satisfy 
a strong consistency guarantee, while
weak operations can appear to execute 
in different orders at different
processes as long as the relative 
ordering of weak and strong operations
executed by the same process is preserved.

System designers should understand the costs of providing these consistency
guarantees in order to choose
a cost-effective consistency guarantee that 
best suits the needs of their applications.  
We focus on (distributed) 
message-passing implementations
because they are scalable.  Each process has a local memory, and all
processes run a protocol to emulate a physical shared memory
with a given consistency guarantee.

Attiya and Welch~\cite{cj99-09-05} compared the costs
of implementing sequential consistency and linearizability 
for basic read/write
objects, queues, and stacks.  They measured the worst-case 
time for operations
to complete, giving upper and 
lower bounds to show that the amount of 
process synchrony caused the cost difference between
sequential consistency and linearizability to vary.  With perfect
synchrony, their costs are 
equal.  However, sequential consistency is cheaper
when processes are only approximately synchronized.
Mavronicolas and 
Roth~\cite{cj99-09-17} continued 
the Attiya-Welch 
comparative study, improving some of their
lower bounds and giving a distributed implementation of linearizable
read/write objects in a system 
with only approximately synchronized processes.
Attiya and Friedman~\cite{cj99-09-04} compared the cost of hybrid
consistency with the costs of sequential consistency and
linearizability for read/write 
objects, showing that hybrid consistency is
cheaper when mostly weak operations are executed.   
Friedman~\cite{cj99-09-08} compared the cost of 
hybrid consistency with the costs of stronger guarantees
for read/modify/write objects, queues, and stacks,
showing that hybrid consistency is not cheaper, even when mostly weak
operations are executed.

\subsection{Overview of results}
Instead of concentrating on specific data types, as in earlier
work, we study the worst-case response times for operations of
{\it arbitrary\/} abstract 
data types in sequentially consistent, linearizable,
and hybrid consistent implementations.  We show that 
algebraic properties of
the operations of an abstract data type 
are sufficient for proving many lower
bounds and some upper bounds on 
the worst-case response times.  Our work
generalizes and unifies previously 
known results, for 
example, \cite{cj99-09-04,cj99-09-05,cj99-09-08,cj99-09-17}.
Some of these algebraic 
properties, such as equivalence and commutativity,
were defined in Weihl's study of concurrency control and recovery
in transaction systems~\cite{cj99-09-18}.
Weihl's definitions helped us to identify
interesting new algebraic properties of operations.

We now give a brief description of our results.
Let $d$ be the maximum message delay of 
a system of processes connected by a
network, and let $u$ be the 
uncertainty in the message delay ($0 \le u < d$).  This
implies that the actual message delay may vary between $d-u$ and $d$.

In the first part of the paper, we
assume that processes have perfectly synchronized clocks and constant
message delays ($u=0$).  Any lower bounds can
hold in systems with weaker, more realistic timing assumptions.
We consider interactions among operations
to determine the worst-case completion times for operations
of the abstract data type to be implemented.
We find several algebraic properties that 
cause either a single operation or
a pair (collectively) to have a 
worst-case completion time of at least $d$ or $2d$.

It is often the case that a particular operation 
or group of operations of an
abstract data type is
known to be used most frequently.  Thus it is 
desirable to optimize (in terms
of worst-case completion time) that
operation or group of operations.  We investigate conditions that
allow this operation or the operations of the group to have
worst-case completion times of $0$ (to be {\em fast}),
meaning that an operation can return immediately based on
current local information at its invoking process, in a linearizable
implementation.
Making one operation fast may require another operation to be
{\em slow}, that is, to have 
a worst-case completion time of $\Omega(d)$. 

Operations may be classified as accessors, 
pure mutators, self-oblivious
operations, immediately
self-commuting operations, or none of the above.
An {\em accessor\/} returns information about
an object without changing 
it.  A {\em pure mutator\/} changes an object
without returning any information 
about it.  The information returned by a
{\em self-oblivious\/} operation 
does not depend on operations with the same name that
are executing at nearly the 
same time.  The information returned by an
{\em immediately self-commuting\/} 
operation does not depend on operations with
the same name that are executing simultaneously.
Figure~\ref{op-rels} shows the relationships among these
classes of operations and 
describes the smallest possible worst-case completion
time (fast or slow) of a single 
operation
in these classes.\footnote{In the case of 
accessors and pure mutators, time is given for a
group of operations.}
\begin{figure}[tbh]
\begin{center}
\setlength{\unitlength}{0.01in}
\begin{picture}(300,300)(100,100)
\thicklines
\put(225,225){\oval(300,300)}
\put(225,225){\oval(250,250)}
\put(225,225){\oval(200,200)}
\put(225,225){\oval(90,50)}
\put(225,287.5){\oval(90,50)}
\put(195,212){\shortstack{pure\\mutator}}
\put(195,284){\shortstack{accessor}}
\put(135,331){\shortstack{immediately self-commuting}}
\put(122,358){\shortstack{not immediately self-commuting}}
\put(187,158){\shortstack{self-oblivious}}
\put(135,215){\shortstack{f\\a\\s\\t}}
\put(105.5,200){\shortstack{u\\n\\k\\n\\o\\w\\n}}
\put(79,215){\shortstack{s\\l\\o\\w}}
\end{picture}
\end{center}
\caption{Operations and smallest worst-case completion 
times\label{op-rels}}
\end{figure}%1

Our general results for systems with perfectly 
synchronized clocks and no
uncertainty in the message delay are that any single
self-oblivious operation 
can be made fast and that no operation that does not
immediately commute with itself can be made 
fast.  Still unknown is the
smallest possible worst-case completion time for a single operation
that is not self-oblivious but does immediately commute with itself.  However,
a large class of 
well-known abstract data types has self-oblivious operations.

We can use the lower and upper bounds proven to deduce that
sequential consistency and 
linearizability are equally costly in systems with
perfectly synchronized clocks when the expected
completion time to perform all 
operations in a linearizable implementation
matches the lower bound for 
all operations under sequential consistency.
Computing the expected completion 
time requires knowledge of the frequency
of executing each operation for the abstract data type.  If
$\op_1, \ldots, \op_n$ are the operations for an abstract data type and
$p_i$ is the frequency of execution for each $\op_i$,
where \(\sum_{i=1}^{n} |p_i| = 1\), 
the expected completion time for a single
operation is \(\sum_{i=1}^{n} p_i |\OP_i| \).
      
In the second part of the paper, we
consider systems with positive uncertainty in the message delay
and approximately synchronized clocks.  
Our goal is to find a list of
algebraic properties that cause operations 
(which satisfy a property in the list)
to have positive worst-case completion
times ($\Omega(u)$) in linearizable implementations
but worst-case completion times of $0$ in
sequentially consistent implementations.
Sequential consistency is then less expensive than linearizability
in this model of process synchrony.

We exhibit two properties that cause 
operations to have worst-case completion
times of at least $u/2$ in linearizable implementations.
An operation satisfying one property or
a restriction of the other property can 
be implemented with a worst-case
completion time of
$0$ in a sequentially consistent implementation.
These properties hold for a large class of
well-known abstract data types.  Under these conditions, sequential
consistency is cheaper than linearizability. 

Finally, we
consider the weaker guarantee of hybrid consistency.
We use many of the algebraic properties used for
sequential consistency 
to yield similar lower bounds for hybrid consistency,
and we deduce that hybrid consistency is not necessarily
cheaper than stronger guarantees.

\subsection{Organization of this paper}
Section~\ref{defs} contains general definitions about abstract data
types and definitions of the three correctness
guarantees.  Section~\ref{perfect} discusses the costs of the strong
guarantees in systems
with perfectly synchronized clocks, with subsections for lower bounds
(sequential consistency) and upper bounds (linearizability).
Section~\ref{approx} discusses the 
costs of the strong guarantees in systems
with only approximately
synchronized clocks, with 
subsections for linearizability (lower bounds) and
sequential consistency 
(upper bounds).  Section~\ref{study-hc} discusses the
costs of hybrid consistency.
We summarize our results in
Section~\ref{summary}.
\section{Definitions}
\label{defs}

A {\em sequential specification\/}
(see \cite{cj99-09-11}) for an abstract data type contains a
set of {\em operations}, which
are ordered pairs of call-and-response events, and a set of legal
operation sequences.  The legal
sequences of operations reflect the semantics of the abstract data
type.  For example, for read/write objects, each read operation in a
legal sequence returns the value written by the last preceding write
operation in the sequence.
The call event represents the
call for the corresponding operation from the abstract data type,
while the response event represents 
the value returned by applying the
operation.  We refer to call 
events and their matching response events
separately because they may not 
occur atomically; they may be separated in time.
For operation $\op$, a call event is of the form $\call(\arglist)$, 
where $\arglist$ is 
the argument list (possibly empty) for the operation.  
For example, a call event for a read operation (respectively, 
write operation)
on a read/write object is of the form $\readit()$ 
(respectively, $\writeit(v)$).
Similarly, a response event is of the form
$\resp(\retlist)$, where $\retlist$ 
is the list of values returned by the
operation and is possibly empty.  For example, a response event
for a read operation (respectively, write 
operation) on a read/write object
is of the form $\ret(v)$ (respectively, $\ack()$).
We can combine the call-and-response events to yield
$\op(\arglist)(\retlist)$.
In our running example,
we get $\readit()(v)$ and $\writeit(v)()$.
We can partition all operations 
into equivalence classes.  All operations
with the same name for their call events are in the same
{generic operation}, or operation type.
$\OP_i$ denotes a generic operation; $\op_i$ and 
$\op_i^j$ denote instantiations
of $\OP_i$ (i.e., nongeneric operations).

The notion of sequential specification implicitly assumes some initialization
of the object; 
for example, $\readit()(3)$ is 
legal for a read/write object that has $3$
as its initial value.  We assume the ability to explicitly initialize an
object $O$ using any sequence of operations $\rho$ that is legal for $O$.
Formally, the object $O_\rho$ is 
a {\em $\rho$-initialized version of $O$} if it
has the same set of operations as $O$, and the 
operation sequence $\sigma$ is legal
for $O_\rho$ if and only if $\rho \circ \sigma$
is legal
for $O$.  The assumption of arbitrary explicit initialization is not
unreasonable,
since initialization normally occurs at the beginning of program executions.

We present two basic algebraic properties of operations from~\cite{cj99-09-18}.
Let $\alpha$ and $\beta$ be operation sequences.  $\alpha$
{\em looks like} $\beta$ if for every operation
sequence $\gamma$, 
$\alpha \circ \gamma$ is legal only if $\beta \circ \gamma$
is legal.
If $\alpha$ looks like $\beta$, then the user of the abstract data type 
never sees the result of an operation that allows the user to distinguish
$\beta$ from $\alpha$ after $\alpha$ is executed.  If $\alpha$ looks like
$\beta$ and $\beta$ looks like $\alpha$, then $\alpha$ and $\beta$ are
{\em equivalent}.
Future operations cannot distinguish between $\alpha$ and
$\beta$.

The preceding definitions are stated in terms of sequential
operations on a single object.  Now we need to enhance our
notation to handle multiple shared objects in concurrent systems.  If $e$ is a
call event, response event, or whole operation,
then $e[O,p]$ denotes that $e$ is performed by a process $p$ on an 
object $O$.  If $\alpha$ is a sequence of operations, then
$\alpha[O,p]$ denotes that all operations in 
$\alpha$ are performed by a process
$p$ on an object $O$.  A 
sequence $\tau$ of operations for a set of objects is
legal if, for 
each object $O$, the restriction of $\tau$ to operations of
$O$, denoted $\tau|O$, is legal for $O$'s abstract data
type.

We now describe the system model.\footnote{Our system model is
the same as in~\cite{cj99-09-05}.}
A {\em memory consistency system\/} (MCS) 
is a set of processes $P$ and a set of
clocks $\cal C$, one for each $p$ in $P$.
Our assumed system consists of a collection of nodes connected by a network.
An application program, a real-time clock, and an 
MCS process are running on each node.
An MCS process can read the real-time
clock residing at its node.  A {\em clock} is a monotonically increasing
function from real time to clock time (both sets are the set of real numbers).
A process cannot modify the real-time clock.  Processes can only obtain 
information about time from their clocks.

The following events can occur at the MCS process on node $p$:

\begin{itemize}

\item A {\em call event\/} occurs when the application program on node $p$
accesses a shared object.

\item A {\em response event\/} occurs when the MCS process on node $p$ gives
a response from a shared object to node $p$'s application program.

\item {\em Message receive events\/} are of the form 
receive$(p,m,q)$ for all
messages $m$ and all nodes $q$.  A message receive event occurs when the MCS
process on node $p$ receives message $m$ from the MCS process on node $q$.

\item {\em Message send events\/} are of the form send$(p,m,q)$ for all messages
$m$ and all nodes $q$.  A message send event happens when the MCS process on
node $p$ sends message $m$ to the MCS process on node $q$.

\item{\em Timer set events\/} are of the form timerset$(p,T)$ for all clock 
times $T$.  This means that $p$ sets a timer to go off when its clock reads
$T$.

\item{\em Timer events\/} 
are of the form timer$(p,T)$ for all clock times $T$.
This means that a timer that was set for time $T$ on $p$'s clock goes off.

\end{itemize}

Call, message receive, and timer events are {\em interrupt events}.

An {\em MCS process\/} (or just {\em process}) 
is an automaton with a set of states,
including an initial state, and a transition function.  Each interrupt event
causes the transition function to be applied.  The transition function is
a function from states, clock times, and interrupt events to states, sets of
response events, sets of message send events, and sets of timer set events
(for future clock times).  This means that the transition function takes as
input the current state, clock time, and interrupt event and then produces a
new state, a set of response events for the application process, a set of
messages to be sent, and a set of timers to be set for the future.

A {\em step\/} of $p$ is a tuple $(s,T,i,s',R,M,S)$, where $s$ and $s'$ are
states ($s$ is the current state and $s'$ is the new state), $T$ is a clock
time, $i$ is an interrupt event, $R$ is a set of
response events, $M$ is a set of message send events, $S$ is a set of timer
set events, and $s'$, $R$, $M$, 
and $S$ are the results of $p$'s transition function
acting on $s$, $T$, and $i$.

A {\em history\/} of a process $p$ with clock $C$ is a countable sequence of
steps such that the following hold:

\begin{itemize}

\item Steps are ordered by $T$, their time components, in increasing order.

\item The old state in the first step is $p$'s initial state.

\item The old state of each subsequent step is the new state of the previous
step.

\item For the subsequence of steps with time component $T = t$, all nontimer
events are ordered before any timer event, and there is at most one timer
event.

\end{itemize}

An {\em execution\/} of an MCS is a
set of histories, one for 
each process $p$ in $P$ with clock $C_p$ in $\cal C$,
which satisfies the following 
two conditions:  First, for all pairs of processes
$p$ and $q$, every message sent from $p$ to $q$ is received by $q$, and
every message received by $q$ from $p$ was actually sent by $p$
(reliable message transmission and no duplicated messages).   We use this
one-to-one correspondence to define the {\em delay\/} of any message in an
execution to be the difference between the real time of receipt and the real 
time of sending.  Second, a timer is received by $p$ at clock time $T$ if and
only if $p$ has previously set a timer for $T$.

An execution $\rho$ is {\em admissible\/} if the following are true:

\begin{itemize}

\item For every $p$ and $q$, every message in $\rho$ from $p$ to $q$ has
delay in the range $[d-u,d]$, for fixed nonnegative integers $d$ and $u$,
$u \le d$.

\item For every $p$, at most one call at $p$ is pending (lacks a matching
response) at any given time.

\end{itemize}

Our definitions of the correctness conditions are identical to those
found in~\cite{cj99-09-05,cj99-09-04}.

Given an execution $\sigma$, let $\ops(\sigma)$ be the sequence of 
call-and-response events 
appearing in $\sigma$ in real-time order.  We need to specify
a tie-breaking mechanism for ordering events that
occur at the same real time
$t$.  In this ordering, the first group of events is formed by the response
events that
happen at time $t$ and whose matching call events happen before
time $t$, ordered by their process identifiers.  The second 
group of events in
the ordering is formed by the call events that happen at time $t$ and whose
matching response events happen at time $t$, ordered by their process
identifiers with a call event immediately preceding its matching response
event.  The third group of events in the ordering is formed by the call events
that happen at time $t$ and whose matching response events happen
after time $t$, ordered by their process identifiers. 

We now define sequential consistency,
linearizability, and hybrid consistency.  These definitions
all imply that every
call eventually has
a matching response and that call events and response events alternate
at a given process (a process never has more than one pending 
call).  If $s$ is
a sequence of operations and $p$ is a process, then we denote the restriction
of $s$ to operations of process $p$ by $s | p$.  Given an execution $\rho$, a
sequence of operations $s$ is a {\em serialization\/} of $\rho$ if it is a
permutation of $\ops(\rho)$.

Sequential consistency ensures that all processes agree on some ordering of
the execution at the granularity of entire operations.  In this ordering, the
operations for each process appear in the order in which they were executed
at that process.
An execution $\rho$ is {\em sequentially consistent\/} if there exists a legal
serialization $\tau$ of $\rho$
such that $\ops(\rho)|p = \tau|p$ for each process $p$.

Like sequential consistency, linearizability ensures that all processes agree
on some ordering of the execution (at the granularity of entire operations) that
preserves the operation sequences of the processes.  In addition, the ordering
must preserve the actual timings of operations.
An execution $\rho$ is {\em linearizable\/} if there exists a legal 
serialization $\tau$ of $\rho$ such that
$\ops(\rho)|p = \tau|p$ for each process $p$, and
$\op_1$ precedes $\op_2$ in $\tau$ if the 
response for $\op_1$ precedes the call for $\op_2$ in $\ops(\rho)$.

Hybrid consistency tries to give us the best of strong consistency
(operations appear to execute everywhere in some fixed order) and weak
consistency (there is no fixed order,
admitting fast implementations).  Each operation has a strong version and
a weak version.  We want all processes to perceive
the same order of execution for all strong operations and
the same relative order of execution for each pair of operations
executed by the same process, where at least one operation in the pair is
strong.

An execution $\rho$ is {\em hybrid consistent\/} if there exists a serialization
$\sigma$ of the strong operations of $\rho$ such that for each process $p$,
there exists a legal sequence $\tau_p$ of operations such that the following
four statements are true:
(1) $\tau_p$ is a permutation of $\ops(\rho)$.
(2) If $\op_1$ and $\op_2$ are executed by
the same process, $\op_1$ precedes $\op_2$ 
in $\rho$, and at least one of $\op_1$
and $\op_2$ is a strong operation, then $\op_1$ precedes $\op_2$ in $\tau_p$.
(3) If $\op_1$ precedes $op_2$ in $\sigma$ and both are strong,
then $\op_1$ precedes $op_2$ in $\tau_p$.
(4) We have $\tau_p | p = \rho | p$.

An MCS is a sequentially consistent (respectively, linearizable or hybrid
consistent) implementation
of a set of objects if any admissible execution of the MCS is sequentially
consistent (respectively, linearizable or hybrid consistent).

We measure the efficiency of an implementation by the worst-case response
time for any operation on any object in the set.  Let $O$ be an object and
$\OP$ be a generic operation.  $|\OP(O)|$ is the maximum time taken by
an $\op$ operation
on $O$ in any admissible execution.  $|\OP|$, the worst-case time for
the generic operation $\OP$ to
be completed, is the maximum of $|\OP(O)|$ 
over all objects implemented by the MCS.

\section{Perfect clocks}
\label{perfect}
In this section we assume that all processes have perfectly synchronized
(perfect) clocks and a constant, known message delay $d$. (These two concepts
are 
equivalent; see~\cite{cj99-09-05}).
We model constant message delay by letting the message uncertainty be
$u = 0$.  We give lower and upper bounds on the costs of providing
sequentially consistent and linearizable implementations of virtual shared
objects.  These lower bounds hold under weaker, more realistic
assumptions about process synchrony.
We prove our lower bounds on the costs of operations in sequentially 
consistent implementations, implying corresponding lower bounds for
linearizable implementations.  Perfect clocks are necessary for the algorithms
demonstrating our upper bounds.  Our algorithms are primarily of theoretical
interest.

Subsection~\ref{spt-lb} describes lower bounds on the costs of implementing
single operations, pairs of operations, and larger groups of operations from
abstract
data types.  Subsection~\ref{lb-all} presents lower bounds on the costs of
implementing all operations from abstract data types.
Subsection~\ref{opt-one} contains implementations
of classes of abstract data types in which a single operation or group of
operations is optimized and describes
abstract data types for which the bounds in Subsection~\ref{spt-lb} are tight.

\subsection{Lower bounds for sequential consistency for singles, pairs, and
groups}
\label{spt-lb}
We give lower bounds on the costs in sequentially consistent implementations
of single operations, pairs of operations, and larger groups of operations
satisfying certain algebraic properties, which
we present as needed.  The
properties are also useful in Section~\ref{study-hc}, where we consider
hybrid consistency.

If $\beta$ and $\gamma$ are
operation sequences, then $\beta$ and $\gamma$ 
	{\em commute\/}~\cite{cj99-09-18}
if, for every operation sequence $\alpha$ such that 
$\alpha \circ \beta$ and $\alpha \circ \gamma$ are legal, 
$\alpha \circ \beta \circ \gamma$ and $\alpha \circ \gamma \circ \beta$ are
legal and equivalent.\footnote{The 
concept is called {\em commute forward\/} in~\cite{cj99-09-18}.}

Formally we say that two generic operations {\em do not commute\/} 
by negating
the previous definition.  $\OP_1$ and $\OP_2$ do not commute if 
there exist operation instances $\op_1$ and $\op_2$ 
and a sequence of operations
$\alpha$ such that $\alpha \circ \op_1$
and $\alpha \circ \op_2$ are legal and (at least) one of the following is true:
\begin{enumerate}
\item $\alpha \circ \op_1 \circ \op_2$ is not legal.
\item $\alpha \circ \op_2 \circ \op_1$ is not legal.
\item $\alpha \circ \op_1 \circ \op_2$ does not look like 
$\alpha \circ \op_2 \circ \op_1$, which means that
there exists an operation sequence $\gamma$ such that
$\alpha \circ \op_1 \circ \op_2 \circ \gamma$
is legal but $\alpha \circ \op_2 \circ \op_1 \circ \gamma$ is not.
\item $\alpha \circ \op_2 \circ \op_1$ does not look like
$\alpha \circ \op_1 \circ \op_2$, which means that
there exists an operation sequence $\gamma$ such that
$\alpha \circ \op_2 \circ \op_1 \circ \gamma$
is legal but $\alpha \circ \op_1 \circ \op_2 \circ \gamma$ is not.
\end{enumerate}

If item 1 or item 2 is true, then $\OP_1$ and
$\OP_2$ {\em immediately do not commute}.\footnote{If $\op_1$ and $\op_2$ are
the same instantiation of the same generic operation $\OP$, then $\OP$
{\em immediately does not commute with itself.}}
If item 3 or item 4 is true, then
$\OP_1$ and $\OP_2$ {\em eventually do not commute}.
If items 1 and 2 are true, $\OP_1$ and $\OP_2$
are {\em cyclically dependent}.
Appendix~\ref{adt-examples} contains several examples of abstract data types
with their commutativity properties.

The proofs of the theorems in this subsection are similar to each other.
They show that some legal executions, each performed by a single process, may
not be combined into a globally legal execution if minimum time bounds for
certain operations are not obeyed.  This results in a violation of sequential
consistency.
We justify the existence of these legal executions and our
combining technique in Appendix~\ref{axioms}.  With each proof, we
provide a picture of the illegal global
execution.
In each picture, time moves from left to right, with critical times shown at
the bottom, and each legal individual execution has its own line.  Also, the
periods of time for message transit are shown.  For each operation (sequence),
we show the object on which it is performed and the process performing it.
Each process completes its operations based on
the information available when the operations are executed.  We
choose starting times for operations so that the finishing times 
(based on the lower bounds) are too small for the processes to use
the information in messages sent by other processes.

We now give a condition under which an {\it individual\/} (generic)
operation of an abstract data type must be slow.
\begin{theorem}
\label{slow-op}
Let $T$ be an abstract data type with a generic 
operation $\OP$ that immediately does
not commute with itself.  In any
sequentially consistent implementation of objects of type $T$, $|\OP| \ge d$.
\end{theorem}

\begin{proof}
The following proof generalizes the proof in~\cite{cj99-09-05} that a dequeue
operation of the queue abstract data type must take at least time $d$.
Let $A$ be an object of type $T$.  Let processes $1$ and $2$
access $A$.
Since $\OP$ immediately does not commute with itself, there exist
a sequence $\rho$ of operations and an operation instance $op$ such that
$\rho \circ \op$ is legal but
$\rho \circ \op \circ op$ is not legal.
Suppose in contradiction that there is a sequentially consistent
implementation of $A_\rho$
for which $|OP| < d$.\footnote{The use of the explicit initialization
assumption is necessary in this proof, because we must use
serialized operation sequences of the form $\rho \circ \gamma$ to prove a
violation of sequential consistency, and the definition of sequential
consistency does not require that $\rho$ appear as a prefix of the
serialization of the execution.  The necessity of the explicit initialization
assumption is an open question.}


Figure~\ref{t:t1} shows possible admissible executions $\alpha_1$ and $\alpha_2$.
Since no messages are received
in $\alpha_1$ and $\alpha_2$, replacing $p_2$'s history in $\alpha_1$
with its history in $\alpha_2$ results in another admissible execution,
$\alpha$.
By assumption, $\alpha$ is sequentially consistent.  Thus, there is a
$\tau$ that 
is a permutation of $\ops(\alpha)$ and is legal for $A_{\rho}$.
However, all possible permutations of $\ops(\alpha)$ are of the form
$\op \circ \op$, which is not legal for $A_{\rho}$.
\end{proof}

\begin{figure}[tbh]
\begin{center}
\setlength{\unitlength}{0.00083300in}%
\begingroup\makeatletter\ifx\SetFigFont\undefined
\def\x#1#2#3#4#5#6#7\relax{\def\x{#1#2#3#4#5#6}}%
\expandafter\x\fmtname xxxxxx\relax \def\y{splain}%
\ifx\x\y   % LaTeX or SliTeX?
\gdef\SetFigFont#1#2#3{%
  \ifnum #1<17\tiny\else \ifnum #1<20\small\else
  \ifnum #1<24\normalsize\else \ifnum #1<29\large\else
  \ifnum #1<34\Large\else \ifnum #1<41\LARGE\else
     \huge\fi\fi\fi\fi\fi\fi
  \csname #3\endcsname}%
\else
\gdef\SetFigFont#1#2#3{\begingroup
  \count@#1\relax \ifnum 25<\count@\count@25\fi
  \def\x{\endgroup\@setsize\SetFigFont{#2pt}}%
  \expandafter\x
    \csname \romannumeral\the\count@ pt\expandafter\endcsname
    \csname @\romannumeral\the\count@ pt\endcsname
  \csname #3\endcsname}%
\fi
\fi\endgroup
\begin{picture}(6150,3360)(1426,-3163)
\thicklines
\put(2401,-2761){\vector( 1, 0){4800}}
\put(2851,-2611){\line( 0,-1){300}}
\put(6676,-2611){\line( 0,-1){300}}
\put(2851,-1936){\line( 1, 0){3825}}
\put(2851,-1786){\line( 0,-1){300}}
\put(6676,-1786){\line( 0,-1){300}}
\put(2851,-1186){\line( 1, 0){1725}}
\put(2851,-1036){\line( 0,-1){300}}
\put(4576,-1036){\line( 0,-1){300}}
\put(2851,-211){\line( 0,-1){300}}
\put(4576,-211){\line( 0,-1){300}}
\put(2851,-361){\line( 1, 0){1725}}
\put(1426, 89){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Execution}}}
\put(1651,-361){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\alpha_2$}}}
\put(1651,-1186){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\alpha_1$}}}
\put(3301,-1711){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}earliest message transit for $\alpha_1$ and $\alpha_2$}}}
\put(1426,-2761){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Time}}}
\put(2851,-3136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}0}}}
\put(6676,-3136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$d$}}}
\put(3301,-136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\op[A_\rho,2]$}}}
\put(3301,-961){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\op[A_\rho,1]$}}}
\end{picture}
\end{center}
\caption{Counterexample for lower bound for an operation that immediately 
does not commute with itself (Theorem~\ref{slow-op})}
\label{t:t1}
\end{figure}%2

For read/write objects, $\writeit(v)()$
immediately commutes with
$\writeit(w)()$, but the two writes eventually do not commute if $v \ne w$.
Therefore, the previous
theorem does not apply 
for $|\WRITE|$.  In fact, ~\cite{cj99-09-05,cj99-09-17}
present algorithms in which writes take less than time $d$.

Next we give a condition under which a {\it pair\/}
of distinct (generic) operations from an abstract data type must be slow.
\begin{theorem}
\label{sc-pair}
Let $T$ be an abstract data type, and let
$\OP_1$ and $\OP_2$ be distinct generic operations
on $T$ that immediately do not
commute.  In any sequentially consistent implementation of at least
two objects of type $T$,
$|\OP_1| + |\OP_2| \ge d$.
\end{theorem}

\begin{proof}
This proof generalizes the proof in~\cite{cj99-09-05} that
$|\READ| + |\WRITE| \ge d$ for read/write objects.
Since $\OP_1$ and $\OP_2$ immediately do not commute, there is a sequence of
operations $\alpha$ and operation instances $\op_1$ and $\op_2$ such that
$\alpha \circ \op_1$ and $\alpha \circ \op_2$ are
legal, but (without loss of generality) 
$\alpha \circ \op_1 \circ \op_2$ is not legal.
Let $A$ and $B$ be two objects of type $T$, and let processes $1$ and $2$ use
$A$ and $B$.  Suppose in contradiction that there is a sequentially consistent
implementation of $A$ and $B$ for which
$|\OP_1| + |\OP_2| < d$.

Figure~\ref{t:t2} shows possible admissible executions $\alpha_1$ and $\alpha_2$.
Since no messages are received in $\sigma_1$ and $\sigma_2$ after time $t$,
replacing process $2$'s history in $\sigma_1$ with its history in $\sigma_2$
results in another admissible execution, $\sigma$.  Then $\ops(\sigma)$
consists of the operations $\op_1[A,1]$ followed by $\op_2[B,1]$, and $\op_1[B,2]$
followed by $\op_2[A,2]$, where both pairs are preceded by $\alpha[A,1]$ and
$\alpha[B,2]$.

By assumption, $\sigma$ is sequentially consistent.  Thus there exists a legal
operation sequence $\tau$ in which the following hold:
\begin{itemize}
\item The operations in $\alpha[A,1]$ are followed by $\op_1[A,1]$, and
$\op_1[A,1]$ is followed by $\op_2[B,1]$.
\item The operations in $\alpha[B,2]$ are followed by $\op_1[B,2]$, and
$\op_1[B,2]$ is followed by $\op_2[A,2]$.
\end{itemize}
Since $\alpha \circ \op_1 \circ \op_2$ is not legal, $\tau$ must have $\op_1[A,1]$
follow $\op_2[A,2]$.  But that causes $\op_2[B,1]$ to follow $\op_1[B,2]$, which is
not legal.
\end{proof}

\begin{figure}[tbh]
\begin{center}
\setlength{\unitlength}{0.00063300in}%
\begingroup\makeatletter\ifx\SetFigFont\undefined
\def\x#1#2#3#4#5#6#7\relax{\def\x{#1#2#3#4#5#6}}%
\expandafter\x\fmtname xxxxxx\relax \def\y{splain}%
\ifx\x\y   % LaTeX or SliTeX?
\gdef\SetFigFont#1#2#3{%
  \ifnum #1<17\tiny\else \ifnum #1<20\small\else
  \ifnum #1<24\normalsize\else \ifnum #1<29\large\else
  \ifnum #1<34\Large\else \ifnum #1<41\LARGE\else
     \huge\fi\fi\fi\fi\fi\fi
  \csname #3\endcsname}%
\else
\gdef\SetFigFont#1#2#3{\begingroup
  \count@#1\relax \ifnum 25<\count@\count@25\fi
  \def\x{\endgroup\@setsize\SetFigFont{#2pt}}%
  \expandafter\x
    \csname \romannumeral\the\count@ pt\expandafter\endcsname
    \csname @\romannumeral\the\count@ pt\endcsname
  \csname #3\endcsname}%
\fi
\fi\endgroup
\begin{picture}(7737,3594)(1,-3397)
\thicklines
\put(826,-1186){\line( 1, 0){1725}}
\put(2551,-1036){\line( 0,-1){300}}
\put(826,-361){\line( 1, 0){1725}}
\put(826,-211){\line( 0,-1){300}}
\put(2551,-211){\line( 0,-1){300}}
\put(826,-1036){\line( 0,-1){300}}
\put(826,-2761){\vector( 1, 0){6900}}
\put(2776,-1936){\line( 1, 0){4125}}
\put(2776,-1786){\line( 0,-1){300}}
\put(4726,-1186){\line( 1, 0){1125}}
\put(4726,-361){\line( 1, 0){1125}}
\put(2776,-1186){\line( 1, 0){1725}}
\put(2776,-1036){\line( 0,-1){300}}
\put(4501,-1036){\line( 0,-1){300}}
\put(2776,-361){\line( 1, 0){1725}}
\put(2776,-211){\line( 0,-1){300}}
\put(4501,-211){\line( 0,-1){300}}
\put(2776,-2611){\line( 0,-1){300}}
\put(4726,-1036){\line( 0,-1){300}}
\put(4726,-2611){\line( 0,-1){300}}
\put(5851,-211){\line( 0,-1){300}}
\put(4726,-211){\line( 0,-1){300}}
\put(5851,-1036){\line( 0,-1){300}}
\put(5851,-2611){\line( 0,-1){300}}
\put(6901,-2611){\line( 0,-1){300}}
\put(6901,-1786){\line( 0,-1){300}}
\put(226,-361){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\sigma_2$}}}
\put(226,-1186){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\sigma_1$}}}
\put(  1, 89){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Execution}}}
\put(  1,-2761){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Time}}}
\put(1101,-136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\alpha[A,1]\alpha[B,2]$}}}
\put(1101,-961){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\alpha[A,1]\alpha[B,2]$}}}
\put(3126,-1711){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}earliest message transit for $op_1$ and $op_2$}}}
\put(3226,-136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\op_1[A,2]$}}}
\put(3226,-961){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\op_1[B,1]$}}}
\put(2776,-3136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$t$}}}
\put(4526,-3136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$t+|\OP_1|$}}}
\put(5651,-3136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$t+|\OP_1|+$}}}
\put(5651,-3411){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$|\OP_2|$}}}
\put(4951,-136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\op_2[B,2]$}}}
\put(4951,-961){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\op_2[A,1]$}}}
\put(6901,-3136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$t+d$}}}
\end{picture}
\end{center}
\caption{Counterexample for lower bound for a pair of operations that 
immediately do not commute (Theorem~\ref{sc-pair})}
\label{t:t2}
\end{figure}%3

Cyclic dependences cause a pair of operations
to be even slower than the previous lower bound.
\begin{theorem}
\label{cd-pair}
Let $T$ be an abstract data type with cyclically dependent generic
operations $\OP_1$ and $\OP_2$. 
In any sequentially consistent implementation of an object
of type $T$, $|\OP_1| + |\OP_2| \ge 2d$.
\end{theorem}

\begin{proof}
Since $\OP_1$ and $\OP_2$
are cyclically dependent, there is a sequence of
operations $\rho$ and operation instances $\op_1$ and $\op_2$ such that
$\rho \circ \op_1$
and $\rho \circ \op_2$ are legal, but
$\rho \circ \op_1 \circ \op_2$ and
$\rho \circ \op_2 \circ \op_1$ are not legal.
Let $A$ be an object of type $T$.  Let processes $1$ and $2$ access $A$.
Assume in contradiction that there exists a sequentially consistent
implementation of $A_\rho$ for which
$|\OP_1| + |\OP_2| < 2d$.  We can assume without loss
of generality that $|\OP_1| \ge |\OP_2|$.

Figure~\ref{t:t3} shows possible 
admissible executions $\alpha_1$ and $\alpha_2$.
Since no messages are received in $\alpha_1$ and $\alpha_2$ before time
$d$,
replacing process $1$'s history in $\alpha_2$ with its history in $\alpha_1$
results in another admissible and sequentially consistent execution, $\alpha$.
Thus, there is a legal permutation of $\ops(\alpha)$
for $A_{\rho}$.  However, because of cyclic dependency,
neither permutation of
$\ops(\alpha)$ is legal for $A_{\rho}$.
\end{proof}

\begin{figure}[tbh]
\begin{center}
\setlength{\unitlength}{0.00083300in}%
\begingroup\makeatletter\ifx\SetFigFont\undefined
\def\x#1#2#3#4#5#6#7\relax{\def\x{#1#2#3#4#5#6}}%
\expandafter\x\fmtname xxxxxx\relax \def\y{splain}%
\ifx\x\y   % LaTeX or SliTeX?
\gdef\SetFigFont#1#2#3{%
  \ifnum #1<17\tiny\else \ifnum #1<20\small\else
  \ifnum #1<24\normalsize\else \ifnum #1<29\large\else
  \ifnum #1<34\Large\else \ifnum #1<41\LARGE\else
     \huge\fi\fi\fi\fi\fi\fi
  \csname #3\endcsname}%
\else
\gdef\SetFigFont#1#2#3{\begingroup
  \count@#1\relax \ifnum 25<\count@\count@25\fi
  \def\x{\endgroup\@setsize\SetFigFont{#2pt}}%
  \expandafter\x
    \csname \romannumeral\the\count@ pt\expandafter\endcsname
    \csname @\romannumeral\the\count@ pt\endcsname
  \csname #3\endcsname}%
\fi
\fi\endgroup
\begin{picture}(5787,3360)(1426,-3163)
\thicklines
\put(2401,-2761){\vector( 1, 0){4800}}
\put(2851,-2611){\line( 0,-1){300}}
\put(6676,-2611){\line( 0,-1){300}}
\put(2851,-1936){\line( 1, 0){3825}}
\put(2851,-1786){\line( 0,-1){300}}
\put(6676,-1786){\line( 0,-1){300}}
\put(2851,-1186){\line( 1, 0){1725}}
\put(2851,-1036){\line( 0,-1){300}}
\put(4576,-1036){\line( 0,-1){300}}
\put(2851,-211){\line( 0,-1){300}}
\put(4576,-211){\line( 0,-1){300}}
\put(2851,-361){\line( 1, 0){1725}}
\put(1651,-361){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\alpha_2$}}}
\put(1651,-1186){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\alpha_1$}}}
\put(3301,-1711){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}earliest message transit for $\alpha_1,\alpha_2$}}}
\put(1476,-2761){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Time}}}
\put(2851,-3136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}0}}}
\put(6676,-3136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$d$}}}
\put(3301,-136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\op[A_\rho,2]$}}}
\put(3301,-961){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\op[A_\rho,1]$}}}
\put(1426, 89){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Execution}}}
\end{picture}
\end{center}
\caption{Counterexample for lower bound for a pair of cyclically 
dependent operations (Theorem~\ref{cd-pair})}
\label{t:t3}
\end{figure}%4

Dequeue and enqueue are not cyclically dependent, 
because an enqueue is always
legal.  Likewise, read and write are not cyclically dependent because
a write is always legal.
\cite{cj99-09-05} shows 
that the lower bounds of Theorem~\ref{sc-pair} are tight.

We now give an example of
a type, BANKACCOUNT2,
with cyclically dependent operations.  The object of the type consists of
values denoting savings and checking account balances.
The operations are $\RSUC$ and $\RCUS$.
$\rsuc(v)(w)$
adds $v$ to the checking account balance and returns the value of the
savings account balance in $w$.  $\rcus(v)(w)$ adds
$v$ to the savings account balance and returns the value of the checking
account balance in $w$.  $\RSUC$ and $\RCUS$ are cyclically dependent.
Initialize the checking account balance to 500 and the savings
account balance to 1000.  An $\rsuc$ instance executed alone would return
1000, and an $\rcus$ instance executed alone would return 500.  If the $\rcus$
instance followed the $\rsuc$ instance, the $\rcus$ instance would not be
legal, and vice versa.

We generalize cyclic dependence for larger groups of operations.
If there exists an operation sequence $\alpha$ and operation instances
$\op_1,\ldots,\op_n$ such that $\alpha \circ \op_1$, $\ldots$,
$\alpha \circ \op_n$ are legal, but each operation sequence formed by
$\alpha$ followed by a permutation of $\op_1,\ldots,\op_n$ is illegal, then
$\OP_1,\ldots,\OP_n$ are {\em $n$-cyclically dependent}.
The previous definition of cyclic dependence corresponds to 2-cyclic
dependence.

\begin{theorem}
\label{not-all-less}
Let $T$ be an abstract data type with operations $\OP_1,\ldots,\OP_n$.
If $\OP_1,\ldots,\OP_n$ are $n$-cyclically dependent, then one of
$|\OP_1|,\ldots,|\OP_n|$ must be at least $d$ in any sequentially
consistent implementation 
of objects of type $T$, where at least $n$ processes
are using the objects.
\end{theorem}

\begin{proof}
Let $A$ be an object of type $T$.  Let processes $1,\ldots,n$ access $A$.
Since $\OP_1,\ldots,\OP_n$ are $n$-cyclically
dependent, there exist an operation sequence $\rho$ and operation instances
$\op_1,\ldots,\op_n$ such that they can individually legally follow $\rho$.
Assume in contradiction that there exists an implementation of $A_\rho$ where
$|\OP_1| < d,\ldots,|\OP_n| < d$,
but they cannot all together legally follow~$\rho$.

Figure~\ref{t:t4} shows admissible executions
$\alpha_1,\ldots,\alpha_n$.
Since no messages are received in $\alpha_1,\ldots,\alpha_n$,
for each $i > 1$, replacing $p_i$'s history in $\alpha_1$ with its history in
$\alpha_i$ results in another admissible and sequentially consistent execution,
$\alpha$.  Thus, there is a $\tau$ that is a
permutation of $\ops(\alpha)$ and is legal for $A_\rho$.  However, no possible
permutation of $\ops(\alpha)$ is legal for $A_\rho$.
\end{proof}

\begin{figure}[tbh]
\begin{center}
\setlength{\unitlength}{0.00083300in}%
\begingroup\makeatletter\ifx\SetFigFont\undefined
\def\x#1#2#3#4#5#6#7\relax{\def\x{#1#2#3#4#5#6}}%
\expandafter\x\fmtname xxxxxx\relax \def\y{splain}%
\ifx\x\y   % LaTeX or SliTeX?
\gdef\SetFigFont#1#2#3{%
  \ifnum #1<17\tiny\else \ifnum #1<20\small\else
  \ifnum #1<24\normalsize\else \ifnum #1<29\large\else
  \ifnum #1<34\Large\else \ifnum #1<41\LARGE\else
     \huge\fi\fi\fi\fi\fi\fi
  \csname #3\endcsname}%
\else
\gdef\SetFigFont#1#2#3{\begingroup
  \count@#1\relax \ifnum 25<\count@\count@25\fi
  \def\x{\endgroup\@setsize\SetFigFont{#2pt}}%
  \expandafter\x
    \csname \romannumeral\the\count@ pt\expandafter\endcsname
    \csname @\romannumeral\the\count@ pt\endcsname
  \csname #3\endcsname}%
\fi
\fi\endgroup
\begin{picture}(5787,3360)(1426,-3163)
\thicklines
\put(2401,-2761){\vector( 1, 0){4800}}
\put(2851,-2611){\line( 0,-1){300}}
\put(6676,-2611){\line( 0,-1){300}}
\put(2851,-1936){\line( 1, 0){3825}}
\put(2851,-1786){\line( 0,-1){300}}
\put(6676,-1786){\line( 0,-1){300}}
\put(2851,-1186){\line( 1, 0){1725}}
\put(2851,-1036){\line( 0,-1){300}}
\put(4576,-1036){\line( 0,-1){300}}
\put(2851,-211){\line( 0,-1){300}}
\put(4576,-211){\line( 0,-1){300}}
\put(2851,-361){\line( 1, 0){1725}}
\put(1651,-361){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\alpha_n$}}}
\put(1651,-1186){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\alpha_1$}}}
\put(3301,-1711){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}earliest message transit for $\alpha_1,...,\alpha_n$}}}
\put(1476,-2761){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Time}}}
\put(2851,-3136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}0}}}
\put(6676,-3136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$d$}}}
\put(3301,-136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\op[A_\rho,n]$}}}
\put(3301,-961){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\op[A_\rho,1]$}}}
\put(1426, 89){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Execution}}}
\end{picture}
\end{center}
\caption{Counterexample for lower bound for $n$-cyclically dependent 
operations (Theorem~\ref{not-all-less})}
\label{t:t4}
\end{figure}%5

We now give an example of a type, BANKACCOUNT3, with $3$-cyclically dependent
operations. The object of the type consists of
values denoting savings, checking, and loan account balances.
The
operations are $\RSUC$, $\RCUL$, and $\RLUS$.
$\rsuc(v)(w)$
adds $v$ to the checking account balance and returns the value of the
savings account balance in $w$.  $\rcul(v)(w)$ adds
$v$ to the loan account balance and returns the value of the checking
account balance in $w$.
$\rlus(v)(w)$ 
adds $v$ to the savings account balance and returns the value of
the loan account balance in $w$.  Initialize the savings account balance to
500, the checking account balance to 1000, and the loan account balance to
1500.  An $\rsuc$ instance executed alone would return 500, an $\rcul$ instance
executed alone would return 1000, and an $\rlus$ instance would return 1500.
In any permutation of all three instances, at least one instance would be
illegal if some account balance has changed.

The number of accessing processes is important.  If only two processes
access objects of type BANKACCOUNT3, there exists an implementation of
BANKACCOUNT3
where $|\RSUC| = d/2$, $|\RCUL| = d/2$, and $|\RLUS| = d/2$, matching the lower
bound of $3d/2$ given in Theorem~\ref{clique}.  The implementation described
in Theorem~\ref{half-all} suffices.

The next theorem gives a larger lower bound for $3$-cyclic operations.

\begin{theorem}
\label{3c2d}
If $T$ is an abstract data type with operations $\OP_1$, $\OP_2$, 
and $\OP_3$ that
are $3$-cyclic, then $|\OP_1| + |\OP_2| + |\OP_3| \ge 2d$ in any sequentially
consistent implementation of objects of type $T$, where there are at least
three accessing processes.
\end{theorem}

\begin{proof}
By Theorem~\ref{not-all-less}, at least one of $|\OP_1|$, $|\OP_2|$, 
and $|\OP_3|$
must be at least $d$.  Without loss of generality, assume 
that $|\OP_1| \ge d$.
Assume in contradiction that $|\OP_2| + |\OP_3| < d$.  We consider $A_\rho$,
the $\rho$-initialized version of $A$.

Figure~\ref{t:t5} shows admissible executions $\alpha_1$, $\alpha_2$, and
$\alpha_3$.
Since no messages are received in $\alpha_1$, $\alpha_2$, and $\alpha_3$ before
time $d$, replacing process 1's history in $\alpha_3$ with its history
in $\alpha_1$ and replacing process 2's history in $\alpha_3$ with its
history in $\alpha_2$ results 
in another admissible and sequentially consistent
execution $\alpha$.
Thus, there is a legal permutation
of $\ops(\alpha)$ for $A_\rho$.  However, no possible permutation
of $\ops(\alpha)$ is legal for $A_\rho$.
\end{proof}

\begin{figure}[tbh]
\begin{center}
\setlength{\unitlength}{0.00063300in}%
\begingroup\makeatletter\ifx\SetFigFont\undefined
\def\x#1#2#3#4#5#6#7\relax{\def\x{#1#2#3#4#5#6}}%
\expandafter\x\fmtname xxxxxx\relax \def\y{splain}%
\ifx\x\y   % LaTeX or SliTeX?
\gdef\SetFigFont#1#2#3{%
  \ifnum #1<17\tiny\else \ifnum #1<20\small\else
  \ifnum #1<24\normalsize\else \ifnum #1<29\large\else
  \ifnum #1<34\Large\else \ifnum #1<41\LARGE\else
     \huge\fi\fi\fi\fi\fi\fi
  \csname #3\endcsname}%
\else
\gdef\SetFigFont#1#2#3{\begingroup
  \count@#1\relax \ifnum 25<\count@\count@25\fi
  \def\x{\endgroup\@setsize\SetFigFont{#2pt}}%
  \expandafter\x
    \csname \romannumeral\the\count@ pt\expandafter\endcsname
    \csname @\romannumeral\the\count@ pt\endcsname
  \csname #3\endcsname}%
\fi
\fi\endgroup
\begin{picture}(7087,5810)(1426,-5613)
\thicklines
\put(4126,-211){\line( 0,-1){300}}
\put(5251,-211){\line( 0,-1){300}}
\put(4126,-361){\line( 1, 0){1125}}
\put(4726,-4711){\line( 0,-1){300}}
\put(5326,-4711){\line( 0,-1){300}}
\put(6601,-4711){\line( 0,-1){300}}
\put(3526,-2686){\line( 1, 0){3675}}
\put(3526,-2536){\line( 0,-1){300}}
\put(3526,-1936){\line( 1, 0){3075}}
\put(6601,-1786){\line( 0,-1){300}}
\put(2851,-3286){\line( 1, 0){3750}}
\put(2851,-3136){\line( 0,-1){300}}
\put(6601,-3136){\line( 0,-1){300}}
\put(2851,-4036){\line( 1, 0){3225}}
\put(6076,-3886){\line( 0,-1){300}}
\put(4126,-1036){\line( 0,-1){300}}
\put(7801,-1036){\line( 0,-1){300}}
\put(2851,-3886){\line( 0,-1){300}}
\put(3526,-1786){\line( 0,-1){300}}
\put(2851,-4711){\line( 0,-1){300}}
\put(6076,-4711){\line( 0,-1){300}}
\put(3526,-4711){\line( 0,-1){300}}
\put(4126,-4711){\line( 0,-1){300}}
\put(7201,-2536){\line( 0,-1){300}}
\put(7201,-4711){\line( 0,-1){300}}
\put(7801,-4711){\line( 0,-1){300}}
\put(4126,-1186){\line( 1, 0){3675}}
\put(2401,-4861){\vector( 1, 0){6100}}
\put(1651,-361){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\alpha_3$}}}
\put(4126,-136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\op_3[A_\rho,2]$}}}
\put(1426, 89){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Execution}}}
\put(1426,-4861){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Time}}}
\put(2851,-5236){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}0}}}
\put(4326,-4600){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\frac{|\OP_1|+|\OP_3|}{2}$}}}
\put(3776,-5236){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\frac{|\OP_1|-|\OP_3|}{2}$}}}
\put(5026,-5236){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\frac{|\OP_1|+|\OP_2|}{2}$}}}
\put(6076,-4600){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$d$}}}
\put(6426,-5236){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$|\OP_1|$}}}
\put(1651,-1936){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\alpha_2$}}}
\put(3751,-2461){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}earliest message transit for $\op_2$}}}
\put(1651,-3286){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\alpha_1$}}}
\put(3301,-3061){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\op_1[A_\rho,1]$}}}
\put(3976,-1711){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\op_2[A_\rho,2]$}}}
\put(3201,-3811){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}earliest message transit for $\op_1$}}}
\put(4301,-961){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}earliest message transit for $\op_3$}}}
\put(3201,-4600){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\frac{|\OP_1|-|\OP_2|}{2}$}}}
\put(6726,-4600){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\frac{|\OP_1|-|\OP_2|}{2}+d$}}}
\put(7426,-5236){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\frac{|\OP_1|-|\OP_3|}{2}+d$}}}
\end{picture}
\caption{Counterexample for lower bound for 3-cyclically 
dependent operations (Theorem~\ref{3c2d})}
\end{center}
\label{t:t5}
\end{figure}%6

We now define a condition causing
either a pair of operations or an individual operation in a trio of
operations to be slow.
A generic operation $\OP$ is
{\em noninterleavable with respect to $\OP_1$ preceding $\OP_2$}
if there exist operation sequence $\rho$ and operation instances
$\op$, $\op_1$, and $\op_2$ such that $\rho \circ \op$
and $\rho \circ \op_1 \circ \op_2$ are legal, but
none of $\rho \circ \op \circ \op_1 \circ \op_2$,
$\rho \circ \op_1 \circ \op \circ \op_2$, and
$\rho \circ \op_1 \circ \op_2 \circ \op$ is legal.  

\begin{theorem}
\label{sc-big-cycle}
Let $T$ be an abstract data type, and let $\OP_1$, $\OP_2$, and $\OP_3$ be
generic
operations of $T$ such that $\OP_3$ is 
noninterleavable with respect to $\OP_1$
preceding $\OP_2$.
Then in any sequentially consistent implementation of objects of type $T$,
$|\OP_1| + |\OP_2| \ge d$ or $|\OP_3| \ge d$.
\end{theorem}

\begin{proof}
Since $\OP_3$ is 
noninterleavable with respect to $\OP_1$ preceding $\OP_2$, there
exists an operation sequence $\rho$ and operation instances $\op_1$, $\op_2$,
and $\op_3$ such that $\rho \circ \op_3$ and $\rho \circ \op_1 \circ \op_2$
are legal, but none of $\rho \circ \op_3 \circ \op_1 \circ \op_2$,
$\rho \circ \op_1 \circ \op_3 \circ \op_2$, and
$\rho \circ \op_1 \circ \op_2 \circ \op_3$ is legal.

Let $A$ be an object of type $T$.  Let processes $1$ and $2$ access $A$.
Assume in contradiction that there exists a sequentially consistent
implementation of $A_\rho$ for which $|OP_1| + |\OP_2| < d$ and
$|OP_3| < d$.

Figure~\ref{t:t6} shows admissible executions $\alpha_1$ and $\alpha_2$.
Since no messages are received in $\alpha_1$ and $\alpha_2$,
replacing process $1$'s history in $\alpha_2$ with its history in $\alpha_1$
results in another admissible and sequentially consistent execution, $\alpha$.
Thus, there is a permutation of $\ops(\alpha)$,
$\tau$, which preserves the order of process $1$'s
operations in $\alpha$ and is legal for $A_{\rho}$.  However, because of
noninterleavability, none of the three permutations of
$\ops(\alpha)$ that satisfy the order of process $1$'s operations is
legal for $A_{\rho}$.
\end{proof}

\begin{figure}[tbh]
\begin{center}
\setlength{\unitlength}{0.00083300in}%
\begingroup\makeatletter\ifx\SetFigFont\undefined
\def\x#1#2#3#4#5#6#7\relax{\def\x{#1#2#3#4#5#6}}%
\expandafter\x\fmtname xxxxxx\relax \def\y{splain}%
\ifx\x\y   % LaTeX or SliTeX?
\gdef\SetFigFont#1#2#3{%
  \ifnum #1<17\tiny\else \ifnum #1<20\small\else
  \ifnum #1<24\normalsize\else \ifnum #1<29\large\else
  \ifnum #1<34\Large\else \ifnum #1<41\LARGE\else
     \huge\fi\fi\fi\fi\fi\fi
  \csname #3\endcsname}%
\else
\gdef\SetFigFont#1#2#3{\begingroup
  \count@#1\relax \ifnum 25<\count@\count@25\fi
  \def\x{\endgroup\@setsize\SetFigFont{#2pt}}%
  \expandafter\x
    \csname \romannumeral\the\count@ pt\expandafter\endcsname
    \csname @\romannumeral\the\count@ pt\endcsname
  \csname #3\endcsname}%
\fi
\fi\endgroup
\begin{picture}(5787,3360)(1426,-3163)
\thicklines
\put(2401,-2761){\vector( 1, 0){4800}}
\put(2851,-2611){\line( 0,-1){300}}
\put(6676,-2611){\line( 0,-1){300}}
\put(2851,-1936){\line( 1, 0){3825}}
\put(2851,-1786){\line( 0,-1){300}}
\put(6676,-1786){\line( 0,-1){300}}
\put(2851,-1186){\line( 1, 0){1725}}
\put(4801,-1186){\line( 1, 0){1725}}
\put(2851,-1036){\line( 0,-1){300}}
\put(4576,-1036){\line( 0,-1){300}}
\put(6526,-1036){\line( 0,-1){300}}
\put(2851,-211){\line( 0,-1){300}}
\put(5251,-211){\line( 0,-1){300}}
\put(2851,-361){\line( 1, 0){2400}}
\put(4801,-1036){\line( 0,-1){300}}
\put(1651,-361){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\alpha_2$}}}
\put(1651,-1186){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\alpha_1$}}}
\put(3301,-136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$op_3[A_\rho,2]$}}}
\put(3301,-961){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$op_1[A_\rho,1]$}}}
\put(3301,-1711){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}earliest message transit for $op_1$ and $op_3$}}}
\put(1426,-2761){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Time}}}
\put(2851,-3136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}0}}}
\put(6676,-3136){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$d$}}}
\put(1426, 89){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Execution}}}
\put(5251,-961){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$op_2[A_\rho,1]$}}}
\end{picture}
\end{center}
\caption{Counterexample for noninterleavability lower 
bound (Theorem~\ref{sc-big-cycle})}
\label{t:t6}
\end{figure}%7

Consider our BANKACCOUNT2 objects, and define the additional
operations $\UC$
(which adds a certain amount 
to the checking account balance) and $\RS$ (which
reads and returns the value of the savings account balance).  $\RCUS$ is
noninterleavable with respect to $\UC$ preceding $\RS$.  Initialize the checking
account balance to 500 and the savings account balance to 1000.  An $\rcus$
instance executed alone would return 500, and an $\rs$ instance following a
$\uc$ instance would return 1000.  Including all three instances would make
either the $\rcus$ or the $\rs$ illegal, depending on where 
the $\rcus$ instance was placed.

\begin{table}
\caption{Corollaries for Subsection~\ref{spt-lb}\label{t:spt-cors}}
\begin{center}\footnotesize
\begin{tabular}{llll}\hline
Operations & Type/table references & Theorems & Citations\\ \hline
\it DEQ & Table~\ref{queue-rel} & ~\ref{slow-op} & ~\cite{cj99-09-05}\\ 
\it POP & Table~\ref{stack-rel} & ~\ref{slow-op} & ~\cite{cj99-09-05}\\ 
\it DEL & Table~\ref{bst-rel} & ~\ref{slow-op} & \\ 
\it WITHDRAW & Table~\ref{bank-acct} & ~\ref{slow-op} & \\ 
\it READ, WRITE & Read/write objects & ~\ref{sc-pair} & ~\cite{cj99-09-05,cj99-09-15}\\ 
Any two distinct generic & Tables~\ref{queue-rel}--\ref{bank-acct} & ~\ref{sc-pair} & \\
operations & & & \\ 
\it RSUC, RCUS &  BANKACCOUNT2 & ~\ref{cd-pair} & \\ 
\it RSUC, RCUL, RLUS & BANKACCOUNT3 & ~\ref{not-all-less},~\ref{3c2d} 
& \\ 
\it RCUS, UC, RS & BANKACCOUNT2 & ~\ref{sc-big-cycle} 
& \\ \hline
\end{tabular}
\end{center}
\end{table}

Table~\ref{t:spt-cors}
displays types applicable to the theorems in this subsection.
For each row of the table, the operation from the first column, which
is from the type or table reference listed in the second column, meets the
hypotheses for the theorem(s) listed in the third column.  Any references to a
specific lower bound corresponding to the given operation are
presented in the fourth column.

\subsection{Lower bounds for sequential consistency for all 
operations of a type}
\label{lb-all}
We now use the previous results
and the structure of the commutativity properties to deduce
lower bounds on the worst-case time complexity for {\it all\/}
operations of abstract data types in sequentially consistent implementations.

An alternate way to represent the commutativity properties of an abstract
data type $T$ is to use a
{\em commutativity graph} $\CG(T)$, with the
generic operations as nodes.  An edge exists between two nodes if their
corresponding operations immediately do not commute.   A loop exists at a
node if the corresponding operation immediately does not commute with itself.
Figure~\ref{t:cgpic} 
shows some examples of commutativity graphs.

\begin{center}
\begin{figure}[tbh]
\setlength{\unitlength}{0.00083300in}%
\begingroup\makeatletter\ifx\SetFigFont\undefined
\def\x#1#2#3#4#5#6#7\relax{\def\x{#1#2#3#4#5#6}}%
\expandafter\x\fmtname xxxxxx\relax \def\y{splain}%
\ifx\x\y   % LaTeX or SliTeX?
\gdef\SetFigFont#1#2#3{%
  \ifnum #1<17\tiny\else \ifnum #1<20\small\else
  \ifnum #1<24\normalsize\else \ifnum #1<29\large\else
  \ifnum #1<34\Large\else \ifnum #1<41\LARGE\else
     \huge\fi\fi\fi\fi\fi\fi
  \csname #3\endcsname}%
\else
\gdef\SetFigFont#1#2#3{\begingroup
  \count@#1\relax \ifnum 25<\count@\count@25\fi
  \def\x{\endgroup\@setsize\SetFigFont{#2pt}}%
  \expandafter\x
    \csname \romannumeral\the\count@ pt\expandafter\endcsname
    \csname @\romannumeral\the\count@ pt\endcsname
  \csname #3\endcsname}%
\fi
\fi\endgroup
\thicklines
\begin{picture}(5270,5673)(-11,-4822)
\put(4701,-136){\circle{7500}}
\put(1651,-3661){\circle{750}}
\put(3301,-1861){\circle{750}}
\put(4876,-3661){\circle{750}}
\put(1651,-136){\circle{750}}
\put(2026,-136){\line( 1, 0){2325}}
\put(3001,-2086){\line(-5,-6){1088.115}}
\put(3601,-2086){\line( 5,-6){1057.377}}
\put(4501,-3736){\line(-1, 0){2475}}
\put(1351,4){\line( 0, 1){385}}
\put(1351,389){\line( 1, 0){600}}
\put(1951,389){\line( 0,-1){385}}
\put(1426,-211){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}delete}}}
\put(4426,-211){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}enqueue}}}
\put(1450,-1111){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}(a) Commutativity 
    graph for queue operations}}}
\put(1070,-4786){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}(b) Commutativity 
    graph for reference-count set operations}}}
\put(1376,-3736){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}update}}}
\put(4726,-3736){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}find}}}
\put(3076,-1936){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}insert}}}
\end{picture}
\caption{Examples of commutativity graphs}
\label{t:cgpic}
\end{figure}%8
\end{center}

$\NC(T)$ is the subset of nodes in $\CG(T)$ such that each node's
corresponding operation immediately does not commute with itself.
$\RCG(T)$ (the reduced commutativity graph for $T$) is the subgraph of
$\CG(T)$ formed by deleting all nodes in $\NC(T)$ and their incident edges.
$\Maxdom(\RCG(T))$ is a subgraph of $\RCG(T)$ formed by a maximum
independent edge dominating set of $\RCG(T)$. Notice that a maximum
independent
edge dominating set of a graph is a largest subset of edges of the graph
such that
distinct edges in the subset do not have nodes in common and all other edges in
the graph have a node in common with one of the edges in the set.  It is clear
that the size of a maximum independent edge dominating set is at most half the
number of nodes in the graph.

This result gives lower bounds on the costs of implementing abstract data types
with general commutativity graphs.

\begin{theorem}
\label{cgt}
Let $T$ have operations $\OP_1, \OP_2, \ldots, \OP_n$
and commutativity graph $\CG(T)$.  In 
any sequentially consistent implementation of 
$T$, 
$$
\sum_{i=1}^{n} |\OP_i| 
\ge (|\NC(T)| + |\Maxdom(\RCG(T))|) d.
$$
\end{theorem}

\begin{proof}
If $\OP_i$ immediately does not commute with itself, then
$|\OP_i| \ge d$
by Theorem~\ref{slow-op}.
Thus, \(\sum_{\OP_i \in \NC(T)} |\OP_i| \)
$\ge |\NC(T)| d$.
Let $(\OP_i,\OP_j)$ be an edge in $\Maxdom(\RCG(T))$.
By the definition of
$\Maxdom(\RCG(T))$, $\OP_i$ and $\OP_j$ immediately do not commute.
By Theorem~\ref{sc-pair},
$|\OP_i| + |\OP_j| \ge d$.
Adding these inequalities yields the desired result.
\end{proof}

We now give a lower bound on the time required for all operations of an
abstract data type with a {\it clique\/} in its commutativity graph.

\begin{theorem}
\label{clique}
Let $T$ have operations $\OP_1, \OP_2, \ldots, \OP_n$
such that for all $i \in \{1,\ldots,n\}$, $\OP_i$ immediately does not
commute with $\OP_j$ if $i \ne j$.  In any sequentially consistent
implementation of $T$,
\(\sum_{i=1}^{n} |\OP_i| \ge |\NC(T)| d+ (n-|\NC(T)|) d/2 \).
\end{theorem}

\begin{proof}
Let $s = |\NC(T)|$.
Let $\OP_{i_1}, \ldots, \OP_{i_s}$ be the operations 
that immediately do not
commute with themselves.  By Theorem~\ref{slow-op},
$|\OP_{i_k}| \ge d$ for all $k$ in $\{1,\ldots,s\}$.  Thus,
\(\sum_{k=1}^{s} |\OP_{i_k}| \ge s d \) .  Let
$\OP_{j_1}, \ldots, \OP_{j_{n-s}}$
be the remaining operations.  We can assume without loss of generality that
$t = n-s > 2$, because Theorem~\ref{sc-pair} handles the $t=2$ case.
Since $\OP_{j_a}$ and $\OP_{j_b}$ 
do not commute for all $a,b$ in $\{1,\ldots,t\}$
where $a \ne b$,
by Theorem~\ref{sc-pair}, we get a system of
$C(t,2) = t(t-1)/2$ inequalities
of the form $|\OP_{j_a}| + |\OP_{j_b}| \ge d$, where
$a \neq b$.\footnote{$C(t,2)$ means the
number of ways to
choose two distinct items from a set of $t$ items.}

We want to minimize  \(\sum_{k=1}^{t} |\OP_{j_k}| \) given the above
constraints and the constraints $|\OP_{j_k}| \ge 0$ for all
$k \in \{1,\ldots,t\}$.  We want to minimize the sum, because
we want the the sum of each $|\OP_{j_a}|$ 
and $|\OP_{j_b}|$ in the global sum
to be as close as possible to $d$ from the corresponding individual
constraint.  If the system of equations of the form
$|\OP_{j_a}| + |\OP_{j_b}| = d$ 
has a solution, the solution would yield a lower
bound for \(\sum_{k=1}^{t} |\OP_{j_k}| \), because each $|OP_{j_k}|$ is
nonnegative.  Let us use the system of 
equations.  Without loss of generality,
we consider the following:
\begin{itemize}
\item $|\OP_{j_1}| + |\OP_{j_2}| = d$.
\item $|\OP_{j_1}| + |\OP_{j_3}| = d$.
\item $|\OP_{j_2}| + |\OP_{j_3}| = d$.
\end{itemize}
We can subtract the third equation from the second to yield the
equation $|\OP_{j_1}| - |\OP_{j_2}| = 0$.  Adding the new equation
to the first yields $2 |\OP_{j_1}| = d$.  We now get that
$|\OP_{j_1}| = d/2$.  It easily follows that
$|\OP_{j_k}| = d/2$ for all $k \in \{1,\ldots,t\}$.
Thus,
\(\sum_{k=1}^{t} |\OP_i| = \sum_{k=1}^{s} |\OP_{i_k}| +
\sum_{k=1}^{t} |\OP_{j_k}| \ge (n-s) d/2 \), as desired.
\end{proof}

Most of our lower bounds do not restrict the costs of
individual operations, provided that certain sums of costs exceed the lower
bounds.  Let us consider implementations of abstract data types where each
operation takes either time $0$ or time $d$.  Thus each implementation
corresponds
to a labeling of 
the commutativity graph where each node is labeled with a $0$
or a $d$, a 
{\em 0-$d$ assignment}.  The total cost of the implementation
equals $d$ multiplied by the number of $d$'s in the labeling.
We now show that the the minimum number of $d$'s required in a labeling
is the size of a minimum vertex cover
($\VC$).\footnote{A minimum vertex cover is a smallest subset 
of nodes of a graph 
such that each edge in the graph has an endpoint contained
in the subset.}

\begin{theorem}
\label{vcodt}
Let $T$ have operations $\OP_1, \ldots, \OP_n$.  In
any sequentially consistent implementation of $T$ corresponding to a 0-$d$
assignment, 
$$
\sum_{i=1}^{n} |\OP_i| \ge |\VC(\CG(T))|d.
$$
\end{theorem}

\begin{proof}
Assume in contradiction that there exists a sequentially consistent
implementation of $T$ corresponding to a 0-$d$ assignment such that
$$
\sum_{i=1}^{n} |\OP_i| < |\VC(\CG(T))| d.
$$
If $(\OP_i,\OP_j)$ is an edge in $\CG(T)$, then $\OP_i$ and 
$\OP_j$ immediately
do not commute by the definition of $\CG(T)$.  Thus 
$|\OP_i| + |\OP_j| \ge d$ by
Theorem~\ref{sc-pair}, implying that 
at least one of $\OP_i$ and $\OP_j$ must be
labeled with $d$ in the 0-$d$ assignment.  Let $S$ be the set of operations
labeled with $d$.  $|S| < |\VC(\CG(T))|$.
Since $\VC(\CG(T))$ is a smallest
set of nodes that cover the edges of $\CG(T)$, $S$ 
does not cover the edges.  Thus there exists an edge
$(\OP_i,\OP_j)$ in $\CG(T)$ such that 
$\OP_i$ and $\OP_j$ are not in $S$, that is,
they are labeled with $0$ in
the 0-$d$
assignment.  By the definition of $\CG(T)$, 
$\OP_i$ and $\OP_j$ immediately do not
commute, but
$|\OP_i| + |\OP_j| = 0$, violating Theorem~\ref{sc-pair}.
\end{proof}

Table~\ref{t:all-cors} displays types applicable to the theorems in this
subsection.  For each row of the table, the type referenced in the first
column meets the hypotheses of the theorem(s) listed in the second column,
with commutativity graph properties as shown in the remaining columns.

\begin{table}
\caption{Corollaries for Subsection~\ref{lb-all}}
\begin{center}
\begin{tabular}{lccccc} \hline
Type/Table reference & Theorems & $|\Maxdom|$ & $|\NC|$ & $n$ & 
$|\VC|$ \\ \hline
Table~\ref{bst-2-rel} & ~\ref{cgt},~\ref{vcodt} & 1 & & & 2 \\ 
Table~\ref{bst-2-rel} $+$ delete operation & ~\ref{clique} & 
 & 2 & 5 & \\ \hline
\end{tabular}
\end{center}
\label{t:all-cors}
\end{table}

\subsection{Upper bounds for linearizability}
\label{opt-one}
We know various algebraic properties of operations to prove lower bounds on
their worst-case completion times in linearizable implementations.  
Now we show
that the lower bounds are tight.  First we explain how certain classes of
operations and individual operations can be made fast.  Then we present
types where the sum of the worst-case completion times for all operations is
optimized.  Finally we discuss the effects of the results on the relative
costs of sequential consistency and linearizability.

\subsubsection{Optimizing a class of operations or a single operation}
The worst-case completion time of an
operation is at least $d$ if it immediately does not commute with itself.
Given an operation that immediately commutes with itself,
can we implement it so that it performs only local computation,
for which the time is assumed to be negligible compared to the message 
delay in
the communication network?

We demonstrate three algebraic properties allowing
an operation or group of operations to be made fast.
Making a single operation from an abstract data type fast has proved to
be a nontrivial task.  Although we have not quite developed an exact
characterization of when a single operation can be made fast, we
have determined a property (self-obliviousness) 
that is sufficiently general
in allowing a single operation to be made fast.  
Any operation that immediately does not commute with
itself cannot be made fast.  If an operation is
noninterleavable with 
respect to a pair of operations, then some slowdown must
occur (either the individual operation or the pair).  A lack of
self-obliviousness is related to both of these properties.  If an operation
immediately does not commute with itself, then it is not self-oblivious.  If
an operation is not self-oblivious, then it may be
noninterleavable with respect to another operation and itself.

We now define more properties of generic operations.  Although the
properties are noted in the object-oriented programming literature, we
are not aware of any formal definitions like ours.  Let $\alpha$ and
$\beta$ be arbitrary operation sequences.  If the legality of
$\alpha \circ \aop \circ \beta$ implies the legality of $\alpha \circ \beta$
for any instance $\aop$ of generic operation $\AOP$ and if the legality of
$\alpha \circ \beta$ implies the legality of $\alpha \circ \aop^* \circ \beta$
for some instance $\aop$ of generic operation $\AOP$ 
(where $\op^*$ denotes $0$
or more copies of $\op$), then $\AOP$ is an {\em accessor}.  Informally,
an accessor does not change an object's state.  $\FIND$ for reference-count sets
(Table~\ref{bst-2-rel}) is an accessor.
$\MOP$ is a {\em mutator\/}
if there exist operation
sequences $\alpha$ and $\beta$ such that $\alpha \circ \mop \circ \beta$ is
legal but $\alpha \circ \beta$ is not legal for some instance $\mop$ of $\MOP$.
Informally, a mutator
changes an object's state, and this change can be detected.  $\INS$ and $\UP$ for
reference-count sets are mutators.

For any abstract data type, we can always implement accessor operations
such that they only perform local computation.  Other operations take
time $d$ in the implementation.
\begin{theorem}
\label{fast-pure-acc}
Let $T$ be an abstract data type.  If $T$ has generic accessor operations
$\AOP_1, \ldots, \AOP_n$, then there exists a linearizable implementation
of objects of type $T$ where
$|\AOP_1| = \cdots = |\AOP_n| = 0$ and $|\MOP_1| = \cdots = |\MOP_m| = d$ for
all other generic operations $\MOP_1,\ldots,\MOP_m$.
\end{theorem}

\begin{proof}
We exhibit an implementation where
$|\AOP_1| = \cdots = |\AOP_n| = 0$ and
$|\MOP_1| = \cdots = |\MOP_m| = d$.  Each process
keeps a copy of all objects in its local memory.  When an $\aop_i$ is invoked
at process $p$, $p$ performs the operation locally and returns 
the result from
the operation.  When a $\mop_j$ operation on object $X$ is invoked at process
$p$, $p$ sends a message $\DoMop_j(X)$ with the argument list for the operation
to all processes (including itself), waits $d$ time, and returns the result of
performing the operation.  When a process receives any form of $\DoMop$ message,
it performs the operation on $X$ in its local memory.  If the message was
sent by that process, it saves the result so that the process can return it.
We can break ties in the following way: $\DoMop_k$ is handled
before $\DoMop_l$ if $k < l$, and we use process identifiers to break any
remaining ties.

Our correctness proof is very similar to
a proof in~\cite{cj99-09-05} for read/write objects.
Let $\rho$ be an admissible execution.  We systematically construct the
desired $\tau$.  Each operation in $\rho$ {\it occurs\/} at the time of its
response.  Let $\tau$ be the sequence of operations in $\rho$ ordered by the
times of occurrence, breaking ties by placing $\mop$ operations before $\aop$
operations, $\aop_k$ before $\aop_l$ if $k < l$, $\mop_k$ before $\mop_l$ if
$k < l$, and using process identifiers to break any remaining ties.  By
construction, $\rho|p = \tau|p$ for all $p$, and $\tau$ preserves the relative
ordering of nonoverlapping operations.

We now must show that $\tau$ is legal or that, for each object $X$, $\tau|X$ is
in the sequential specification of $T$.
Consider an accessor operation.  It returns based on its local state.  Its
local state reflects all changes made by mutators occurring up to the time
of the accessor in $\rho$.  Thus the accessor operation returns a legal value
list in $\tau$.

Consider a mutator operation that returns a value list.  It returns based on
its local state at its response time in $\rho$.  Its local state reflects all
changes
made by mutators occurring up to the time of the mutator.  Let the sequence
of mutators occurring up to the time of the mutator in $\rho$ be $\alpha$.
The mutator is legal after $\alpha$, which is a subsequence of $\tau$.
Any accessors interleaved with $\alpha$ in $\tau$ 
do not affect the legality
of the mutator.  Thus the mutator
operation returns a legal value list in $\tau$.
\end{proof}

We now define a restriction of mutator operations.  If the
legality of arbitrary operation sequence $\alpha$ implies the legality of
$\alpha \circ \mop$ for any instance $\mop$ 
of generic mutator operation $\MOP$,
then $\MOP$ is a {\em pure mutator}.  Informally, the return value
lists of pure mutators do not depend on the states of the objects
on which they are invoked.  A push operation for an unbounded stack is a pure
mutator.

For any abstract data type, we can make pure mutator operations
fast.  Other operations take time $d$ in the
implementation.

\begin{theorem}
\label{fast-pure-mod}
Let $T$ be an abstract data type.  If $T$ has generic pure mutator operations
$\MOP_1, \ldots, \MOP_n$, then there exists a linearizable implementation of
objects of type $T$ where
$|\MOP_1| = \cdots = |\MOP_n| = 0$ and $|\OP_1| = \cdots = 
|\OP_m| = d$ for all
other generic operations $\OP_1,\ldots,\OP_m$.
\end{theorem}

\begin{proof}
We exhibit an implementation where
$|\MOP_1| = \cdots = |\MOP_n| = 0$ and
all other operations ($\OP_1, \ldots, \OP_m$) take time $d$.  Each process
keeps a copy of all objects in its local memory.  When a $\mop_i$ is invoked
at process $p$, $p$ sends a message $\DoMop_i(X)$ 
with the argument list for the
operation to all processes (including itself) and returns immediately.
When a process receives any form of $\DoMop$ message,
it performs the operation on $X$ in its local memory.  When an $\op_j$ 
on object
$X$ is invoked at process $p$, $p$ sends a message $\DoOp_j(X)$ with the
argument list for the operation to all processes (including 
itself), waits $d$
time, and returns the result of performing the operation.  When a process
receives any form of $\DoOp$ 
message, it performs the operation on $X$ in its
local memory.  If the message was sent by that process, it saves the result so
that the process can return it.

We can break ties in the following way:  $\DoMop$ 
messages are handled before
$\DoOp$ messages, $\DoMop_k$ is handled
before $\DoMop_l$ if $k < l$, $\DoOp_k$ is 
handled before $\DoOp_l$ if $k < l$,
and we use process identifiers to break any remaining ties.

The correctness proof is very similar to
a proof in~\cite{cj99-09-05} for read/write objects.
Let $\rho$ be an admissible execution.  We systematically construct the
desired $\tau$.  Each operation in $\rho$ {\it occurs\/} 
time $d$ after the time
of its call.  Let $\tau$ be the sequence of operations in $\rho$ 
ordered by the
times of occurrence, breaking ties by placing $\mop$ operations before $\op$
operations, $\mop_k$ before $\mop_l$ if $k < l$, $\op_k$ before $\op_l$ if
$k < l$, and by using process identifiers to break any remaining ties.  By
construction, $\rho|p = \tau|p$ for all $p$, and $\tau$ preserves the relative
ordering of nonoverlapping operations.

We now must show that $\tau$ is legal or that, for each object $X$, $\tau|X$ is
in the sequential specification of $T$.
All $\mop$ operations are legal because they are pure mutators.
Consider an $\op_k$ that returns a value list.  It returns based on its local
state at its response time in $\rho$.  Its local state reflects all changes
made by
operations occurring up to the time of the $\op_k$ in $\rho$.  Thus 
the $\op_k$
returns a legal value list in $\tau$.
\end{proof}

\begin{corollary}
There exists a linearizable implementation of increment-half objects
(Table~\ref{inc-half}), where $|\READ| = d$, $|\INC| = 0$,
and $|\HALF| = 0$.
\end{corollary}

Accessors and pure mutators can be (separately) optimized.
Can we optimize a self-commuting
operation that 
both accesses and modifies the states of the objects on which it
is invoked?

We must be careful, because two fast
operation instances do not know about each other if they are executed less
than time $d$ apart.
To optimize generic operation $\FOP$, we show it is
sufficient for $\FOP$ to 
be {\em self-oblivious}.  Intuitively, this means that a $\fop$
operation instance does 
not indirectly affect another $\fop$ operation instance.

Let $\alpha_1, \alpha_2, \ldots$ be operation 
sequences.
$\FOP$ is self-oblivious if, whenever $\alpha_1 \circ \fop^1,
\alpha_1 \circ \alpha_2 \circ \fop^2, \ldots,
\alpha_1 \circ \alpha_2 \circ \cdots \circ \alpha_n \circ \fop^n,
\ldots$ are legal, there exists an
instantiation of return values for the operations in $\alpha_i$ for each
$i \ge 1$, creating new
sequences of operations $\alpha_i'$ for each $i \ge 1$, such that
$\alpha_1' \circ \fop^1 \circ \alpha_2' \circ \fop^2
\circ \cdots \circ \alpha_n' \circ \fop^n \cdots$ is legal.  It is
assumed that no $\alpha_i$ contains a $\fop$ instance.
An operation that immediately does not commute with itself is
not self-oblivious, because the $\alpha$'s can be empty, with the
possible exception of $\alpha_1'$.

What kinds of operations are self-oblivious?  Accessors are
self-oblivious because they do not change object state information.  A pure
mutator is self-oblivious because it
is always legal after a legal sequence of operations.
However,
determining whether an arbitrary operation is self-oblivious involves
the semantics for all operations of its type.  An operation
that immediately 
commutes with itself is self-oblivious if all other operations
of its type do not perform conditional updates (whether and how
to perform updates are based on object state information).

For read/write objects, reads and writes are self-oblivious.
Reads are self-oblivious because they are accessors.
We now show how writes satisfy the self-oblivious 
definition.  Each write in
an operation sequence is legal.  If $\alpha_1 \circ \writeit^1,
\alpha_1 \circ \alpha_2 \circ \writeit^2, \ldots,
\alpha_1 \circ \alpha_2 \cdots \alpha_n \circ \writeit^n,
\ldots$ are 
legal, then
$\alpha_1 \circ \writeit^1 \circ \alpha_2' \circ 
\writeit^2 \cdots \alpha_n'
\circ \writeit^n$ is legal, 
where the return value for the reads in $\alpha_i'$
for $i \ge 2$ is the value written by $\writeit^{i-1}$.

For unbounded
queues and
stacks, enqueues and pushes are self-oblivious.  
We now show how enqueues satisfy the self-oblivious 
definition.  The argument
for pushes is analogous.  If $\alpha_1 \circ enqueue^1,
\alpha_1 \circ \alpha_2 \circ enqueue^2, \ldots,
\alpha_1 \circ \alpha_2 \cdots \alpha_n \circ enqueue^n,
\ldots$ are legal, then
$\alpha_1 \circ enqueue^1 \circ \alpha_2' \circ enqueue^2 \cdots \alpha_n'
\circ enqueue^n$ is legal, where the return value for the first dequeue
returning $\perp$ in $\alpha_i$
for $i \ge 2$ is changed in $\alpha_i'$ to the value enqueued by
$enqueue^{i-1}$.

For BANKACCOUNT2 objects,
$\RSUC$ and $\RCUS$ are 
self-oblivious, because each immediately commutes with
itself and  does not perform conditional updates.  We now show how $\RSUC$
satisfies the self-oblivious definition.  The argument for 
$\RCUS$ is analogous.
An $\RSUC$ operation in an 
operation sequence is legal if the value returned is
the value written by the last $\RCUS$ preceding the $\RSUC$ in the 
sequence.  If
$\alpha_1 \circ \rcus^1,
\alpha_1 \circ \alpha_2 \circ \rcus^2$, $\ldots,
\alpha_1 \circ \alpha_2 \cdots \alpha_n \circ \rcus^n,
\ldots$ are legal, then
$\alpha_1 \circ \rcus^1 \circ \alpha_2' \circ \rcus^2 \cdots \alpha_n'
\circ \rcus^n$ is legal, where the return value for the $\RSUC$ operations
in $\alpha_i'$ for $i \ge 2$ is the value written by $\rcus^{i-1}$.

\begin{theorem}
\label{selfopthm}
Let $T$ be an abstract data type with a self-oblivious generic operation
$\SELFOP$.
There exists a linearizable implementation of objects of type $T$ where
$|\SELFOP| = 0$ and $|\OP| = 2d$ for all other generic operations $\OP$.
\end{theorem}

We instantiate the algorithm in Figure~\ref{opt-fig} 
\auquery{In Figure~\ref{opt-fig}, Is everything italic that should be?
Also, check capitalization and punctuation.  Put periods at ends of 
sentences?}
to yield our
implementation of objects of type $T$.  Each assignment
statement is executed locally.  Each process keeps both an actual copy and a
scratch copy of each object.
In addition, each process maintains an ordered set of message slots, where
each slot is indexed by a time.  A slot holds messages that are either sent
or received at a given time.

{\footnotesize
\begin{figure}%[tbh]
${\it fastop}({\it args})[X,p]$: \\
$\scratch_X := X$ \\
Handle all unhandled message in slots up to (current time) \\
about $X$ in message slot order, \\
updating $\scratch_X$ as necessary \\
Determine return value based on $\scratch_X$\\
Send handle-fastop$(X,{\it args},p)$ to all processes \\
${\it fastopret}({\it return value})[X,p]$ \\

${\it otherop}({\it args})[X,p]$: \\
send handle-otherop$(X,{\it args},p)$ to all processes \\

receive a message: \\
if it is a handle-fastop, then \\
\null\quad Insert in slot ($\mbox{current time} - d$), tiebreaking after
other \\
\null\quad message types and then by process id \\
else /* handle-otherop message */ \\
\null\quad Insert in slot (current time) 
with a fixed tiebreaking \\
\null\quad order among message types
and then by process id \\
\null\quad timerset($p,d$) \\

when timer goes off: \\
for each unhandled message in slots up to
($\mbox{current time} - d$), \\
considered in message slot order, do: \\
\null\quad Use it to update the actual copy of its object \\
\null\quad if it is not a handle-fastop message, then \\
\null\quad /* it is of the form handle-otherop$(X,{\it args},q)$ */ \\
\null\qquad if $p = q$, then \\
\null\qquad\quad Determine return value based on $X$ \\
\null\qquad\quad ${\it otheropret}({\it return~value})[X,p]$ \\
\null\quad Mark message as handled
\caption{Algorithm for optimizing one operation: Code for 
process $p$\label{opt-fig}}
\end{figure}%9
}

When a $\selfop$ operation is invoked, the
invoking process sends a message about the operation to all other processes,
determines the 
return value list (possibly by updating its scratch copy of the
object based on messages it has received), and returns.  When an instance of
another generic operation is invoked, the invoking process sends a message
about the operation to all other processes.  

Messages about $\selfop$
have a higher priority than messages about other operations.
If a message about a
self-oblivious operation is received at time $t$, then it is placed in slot
$t-d$.  If any other message 
is received at time $t$, then it is placed in slot
$t$.  

If a received message is not about a $\selfop$
operation, then a timer for $d$ later is set.  When a timer goes off, all
messages are handled that are 
in slots indexed by time up to $d$ before the current time, 
updating actual copies of objects 
as necessary.  At this time, if the
process has a pending operation for which its message has been handled, the
process completes its pending operation.  The timers and slots are also used
to give $\selfop$ higher 
priority than other operations because of its need to
complete quickly.

\begin{proof}
We must prove that the implementation guarantees
linearizable executions.
Let $\rho$ be an admissible execution.  To form
$\tau$, we place all operations 
in order according to their message slots and, if their message slots
contain multiple messages,
according to their positions in the message slots.

First, we show that $\tau$ preserves the relative ordering of nonoverlapping
operations in $\rho$.  Suppose that $\op_1$ precedes $\op_2$ in $\rho$; that
is, $t_1$, the finishing time for $\op_1$, is less than $t_2$, the starting
time for $\op_2$.  We
must analyze the ordering based on the classification of $\op_1$ and $\op_2$.
Because $\op_1$ may or may not be a $\selfop$ operation 
and $\op_2$ may or may not be
a $\selfop$ operation, we have four cases to check:
\begin{itemize}
\item[a.] $\op_1$ and $\op_2$ are $\selfop$ operations.  The starting time
for $\op_1$ is $t_1$.  $\op_1$'s message slot
is $(t_1 + d) - d = t_1$, and 
$\op_2$'s message slot is $(t_2 + d) - d = t_2$.
We have $t_1 < t_2$, and thus
$\op_1$ is correctly placed before $\op_2$ in $\tau$.
\item[b.] $\op_1$ is a $\selfop$ operation, but $\op_2$ is not a $\selfop$
operation.  As before, $\op_1$'s message slot is $t_1$, and $\op_2$'s message
slot is $t_2 + d$.  We have  $t_1 < t_2$,  
and so $t_1 < t_2 + d$.  Thus, $\op_1$ is
correctly placed before $\op_2$ in $\tau$.
\item[c.] $\op_1$ are $\op_2$ are not $\selfop$ 
operations.  The starting time
for $\op_1$ is $t_1 - 2d$.  $\op_1$'s message slot is
$(t_1 - 2d) + d = t_1 - d$, and  $\op_2$'
s message slot is $t_2 + d$.  We have
$t_1 < t_2$, and so $t_1 - d < t_2 + d$.  Thus, $\op_1$ is 
correctly placed before $\op_2$ in $\tau$.
\item[d.] $\op_1$ is not a $\selfop$ operation, but $\op_2$ is.
As before, $\op_1$'s message slot is $t_1 - d$, and $\op_2$'s message
slot is $(t_2 + d) - d = t_2$.  We have $t_1 < t_2$, 
and so $t_1 - d < t_2$.  Thus,
$\op_1$ is correctly placed before $\op_2$ in $\tau$.
\end{itemize}

We now prove the legality of
$\tau = \op_1 \circ \op_2 \circ \cdots \circ
op_n \cdots$ by induction.
We must first show that $op_1$ is legal.  Suppose $\op_1$ was performed on
object $X$.  If $\op_1$ is a $\selfop$ operation,
then $\op_1$'s return value is based on the sequence of operations that are
executed on $\scratch_X$ at $\op_1$'s process by the time $\op_1$ is
invoked in $\rho$.  Since $\op_1$ is the first operation placed, it is in
the earliest time slot.  Thus we have an empty sequence of operations that
are
executed on $scratch_X$ at $\op_1$'s process by the time $\op_1$ is invoked
in $\rho$.  $\op_1$ is legal, because the return value is based on the
semantics of $\op_1$'s abstract data type.  If $\op_1$ is not a $\selfop$
operation, then $\op_1$'s 
return value is based on the sequence of operations
that are executed at $\op_1$'s process in $\rho$ by the time $\op_1$
returns in $\rho$.  As before, $\op_1$ is legal.

The induction step comes next.  Suppose 
that $\op_1 \circ \cdots \circ \op_k$
is legal.  We must prove that 
$\op_1 \circ \cdots \circ \op_k \circ \op_{k+1}$
is legal.  We have two cases to consider.  Either $\op_{k+1}$ is a $\selfop$
operation, or it is not.  Suppose that $\op_{k+1}$ is 
performed on object $X$.

Case 1.  If $\op_{k+1}$ is a $\selfop$ operation, then 
its return value is based on the
sequence of operations that are executed on $\scratch_X$ at $\op_{k+1}$'s
process by the time $\op_{k+1}$ is invoked in $\rho$.  We must now analyze
the structure of $\op_1 \circ \cdots \circ \op_k$ to continue.
$\op_1 \circ \cdots \circ \op_k$ has no $\selfop$ operation, or the sequence
has at least one $\selfop$ operation.

Case 1.1.  If $\op_1 \circ \cdots \circ \op_k$ 
has no $\selfop$ operation,
then the message
slots of $\op_1, \ldots, \op_k$ are earlier than or at the same time as
$\op_{k+1}$'s message slot.  Thus, $\op_1, \ldots, \op_k$ have
been performed, or their messages have been received at $\op_{k+1}$'s
process, implying that they have been performed on $\scratch_X$ (if
they use $X$ at all).  $\scratch_X$ reflects the work done by
$\op_1, \ldots, \op_k$.  Thus, $\op_{k+1}$ is legal, because it was performed
based on a legal sequence of operations and the semantics of $\op_{k+1}$'s
abstract data type.

Case 1.2.  Let $\op_l$ 
be the last $\selfop$ operation that starts before
$\op_{k+1}$.  Either $\op_l$ starts more than time $d$ before $\op_{k+1}$ or
$\op_l$ starts less than time $d$ before $\op_{k+1}$.

Case 1.2.1. If $\op_l$ is performed on $X$, then $scratch_X$ reflects
the change to $X$, implying that $\op_{k+1}$ is legal.

Case 1.2.2.
Consider the operations that start less than time $d$ before $\op_{k+1}$
and are placed before $\op_{k+1}$ in $\tau$.  The time slot for any operation
$\op$ that
is not a $\selfop$ is $t_{\op} + d$, where $t_{\op}$ is the starting
time for $\op$.  The time slot for $\op_{k+1}$ is 
$t_{\op_{k+1}}$, its starting
time, since $\op_{k+1}$ is a $\selfop$.  Because 
$t_{\op_{k+1}} - t_{\op} < d$, we have 
$t_{\op_{k+1}} < t_{\op} + d$.  Thus, $\op$ is placed after $\op_{k+1}$ in
$\tau$.
This means that any operation starting less than time $d$ before $\op_{k+1}$
and placed before $\op_{k+1}$ in $\tau$ is a $\selfop$.  In particular, $\op_k$
is a $\selfop$.  Let $\op_f$ be the first $\selfop$ 
starting less than time $d$
before $\op_{k+1}$ and placed before $\op_{k+1}$.  
$\op_f, \ldots, \op_{k+1}$
are all $\selfop$ operations.  By the induction hypothesis,
$\op_1 \cdots \op_k$ is legal.  Let $\alpha_1 = \op_1 \cdots \op_{f-1}$.
$\alpha_1 \circ \op_f$ is legal, 
$\alpha_1 \circ \op_{f+1}$ is legal, $\ldots, \alpha_1 \cdot
\op_k$ is legal, and $\alpha_1 \circ \op_{k+1}$ is legal, 
because the processes
performing $\op_f, \ldots, \op_k, \op_{k+1}$
have received information about all operations in $\alpha_1$.  Each
operation is performed based on a legal sequence of operations and the
semantics of the operation's abstract data type.  We now use the
self-oblivious property.  Let each of
$\alpha_2, \ldots, \alpha_{k-f+2}$ be the empty
sequence.  The largest subscript on an $\alpha_i$ is $k-f+2$, because
we need to place  $\op_f, \ldots, \op_{k+1}$, a total of $k-f+2$
operations.  $\alpha_1 \circ \op_f \circ \alpha_2 \circ \op_{f+1} \cdots
\alpha_{k-f+1} \circ 
\op_k \circ \alpha_{k-f+2} \circ \op_{k+1}$ is legal
by the definition.

Case 2. If $\op_{k+1}$ is not a $\selfop$ 
operation, then its return
value is based on the sequence of operations that execute at
$\op_{k+1}$'s process in $\rho$ by the time $\op_{k+1}$ returns in $\rho$.
This sequence is a prefix of $\tau$, 
because the execution order of operations
is the message slot ordering.  By the induction hypothesis, the sequence is
legal.  Since $\op_{k+1}$
is executed based on a legal sequence of operations and the semantics of
$\op_{k+1}$'s abstract data type, it is legal.
\end{proof}

In our implementation optimizing a self-oblivious operation, all nonoptimized
operations have a worst-case response time of $2d$.  Our implementation
assumes that the optimized operation has cyclic dependences with all other
operations.  Thus, for certain abstract data types, there are 
gaps between our
upper bound and the lower bounds.

\subsubsection{Types having a tight lower bound for all operations}
We now exhibit some abstract data types with implementations
in which the total time for {\em all\/} operations matches the lower bounds
from Subsection~\ref{lb-all}.
Minimizing the total time required for all operations
may help when the frequencies of invoking each operation are approximately
equal.

A pure modify-read (PMR) object is a variable $X$ that can
be read or modified by a pure mutator operation.  This is a generalization
of the pseudo--read-modify-write (PRMW) object,
because the pure mutator operation may be such that
it eventually does not commute with itself.
(A PRMW object is a variable that can be
read, written, or modified by a pure mutator operation that is a commutative
arithmetic operation; see~\cite{cj99-09-03}.)

There exists a linearizable implementation of PMR objects with the
total worst-case response time for all operations matching the lower bound
on the total worst-case response time for all operations.  In the
implementation, the pure mutators take time $0$, and the read operation takes
time $d$.

\begin{theorem}
\label{prmw-sc-lin}
For any set of PMR objects with operations $\READ$ and
pure mutator operations
$\MOP_1,\ldots,\MOP_n$, there exists a
linearizable implementation of the set
with $|\READ| = d$ and
$|\MOP_1| = \cdots = |\MOP_n| = 0$.
\end{theorem}

\begin{proof}
Since all nonread operations are pure mutators, we can instantiate the
implementation described in Theorem~\ref{fast-pure-mod} to yield an
implementation achieving the desired time bounds.
\end{proof}

An $n$-component array object is an array of $n$ items with each item
capable of being read and/or written by some operation.  For example,
$R1W2R3(v_2)(v_1,v_3)$ reads $v_1$ from item $1$, writes $v_2$ to item $2$,
and reads $v_3$ from item $3$.
Thus, 
we combine the call and response events, as described in
Section~\ref{defs}.
Any operation in which the items being read and the
items being written are disjoint sets is easily seen to be immediately
self-commuting, and any operation that is not immediately self-commuting
must read and write the same item.  If $\OP_1$ reads item $i$ and $\OP_2$
writes item $i$, then $\OP_1$ and $\OP_2$ immediately do not commute.

We now exhibit some implementations of component array data types without
$n$-cyclic dependences that match the lower bounds given earlier.

\begin{theorem}
\label{half-all}
For any component array data type $T$ with immediately self-commuting
operations $\OP_1,\ldots,\OP_n$ and
no $k$-cyclic dependences (for each $k \ge 2$), there exists a linearizable
implementation of $T$ such that \(\sum_{i=1}^{n} |\OP_i| = nd/2 \).
\end{theorem}

\begin{proof}
We exhibit an implementation where $|\OP_1| = \cdots = |\OP_n| = d/2$.
We need our assumption of no $k$-cyclic dependences here because of
Theorem~\ref{cd-pair}.  Each
process keeps a copy of all objects in its local memory.  When an $\op_i$ on
object $X$ is invoked at process $p$, $p$ 
sends a message $\DoOp_i(X)$ with the
argument list for the operation to all processes (including itself), waits
$d/2$ time, and returns 
the result of performing the operation.  When a process
receives any form of $\DoOp$ message, it performs the operation on the
appropriate object in its local memory.  We can break ties in the following
way:  $\DoOp_l$ is handled before $\DoOp_m$ if $\op_l$ reads from a component that
$\op_m$ writes.  We 
use process identifiers to break any remaining ties.  The
assumption about no $k$-cyclic dependences always allows ties to be broken.

We now prove that this algorithm is correct.  Let $\rho$ be an admissible
execution.  We 
systematically construct $\tau$.  Each operation that does not
write occurs at its response time.  Every other operation occurs
$d/2$ time after its response time.

Let $\tau$ be the sequence of operations
in $\rho$ ordered by times of occurrence. We break ties by placing $\op_l$
before $\op_m$ if $\op_l$ reads a component that $\op_m$ writes
and by using process
identifiers to break any remaining ties.
By construction, $\rho | p = \tau | p$ for all $p$, and $\tau$ preserves the
relative ordering of nonoverlapping operations.

We must now show that
$\tau = \op_1 \circ \op_2 \circ \cdots$ is legal.  Consider $\op_i$,
which returns based on the local
state of its process $d/2$ time after its invocation time, $t$.  The local
state reflects all changes made by other operations starting by time $t - d/2$.
Any operation starting after time $t - d/2$ does not appear in $\tau$ before
$\op_i$ by construction 
unless it does not write.  Therefore, it does not affect
the legality of $\op_i$.
\end{proof}
 
If $\OP_i$ and $\OP_j$ 
immediately do not commute for all $i \ne j$, then our
upper bound matches the lower bound from Theorem~\ref{clique}.

The following abstract data type satisfies the hypothesis of
Theorem~\ref{half-all}.
Consider a 2-component array object with operations $R1R2$, $W1R2$, and $W2$.
The abstract data type has a $3$-clique as its commutativity graph.  Each
operation is immediately self-commuting.  No cyclic dependences exist.  This
type is similar to BANKACCOUNT2.

We can optimize implementations of certain data
types with 0-$d$ assignments if they have no $n$-cyclic dependences.

\begin{theorem}
\label{vct}
For any component array data type $T$ with immediately self-commuting
operations $\OP_1,\ldots,\OP_n$ and
no $k$-cyclic dependences (for each $k \ge 2$),
there exists a linearizable
implementation of $T$ such that \(\sum_{i=1}^{n} |\OP_i| = |\VC(T)| d \).
\end{theorem}

\begin{proof}
Assume that $\VC(T)=\{\OP_1,\ldots,\OP_{|\VC(T)|}\}$.
We exhibit an implementation where $|\OP_1| = \cdots = |\OP_{|VC(T)|}| = d$ 
and
$|\OP_{|\VC(T)|+1}| = \cdots = |\OP_n| = 0$.  By the definitions of
vertex cover and commutativity graph,
$\OP_i$ and $\OP_j$ immediately
commute for all $i,j$ in $\{|\VC(T)+1|, \ldots, n\}$.

Each
process keeps a copy of all objects in its local memory.  When an $\op_i$ on
object $X$ is invoked at process $p$, $p$ sends a 
message $\DoOp_i(X)$ with the
argument list for the operation to all processes (including itself).  If 
$i > |\VC(T)|$, the process immediately returns the result of the operation
(reading the components that should be read).  If $i \le |\VC(T)|$, the
process waits
$d$ time and then 
returns the result of performing the operation.  When a process
receives any form of $\DoOp$ message, it performs the operation on the
appropriate object in its local memory.  We can break ties in the following
way: $\DoOp_i$ is handled before $\DoOp_j$ if $\OP_i \not \in \VC(T)$ and
$\OP_j \in \VC(T)$.  Otherwise, $\DoOp_i$ is handled before $\DoOp_j$ if
$i < j$.  We use process identifiers to break any remaining ties.

We now prove that this algorithm is correct.  Let $\rho$ be an admissible
execution.  We systematically construct $\tau$.  Each operation occurs
at its response time.
Let $\tau$ be the sequence of operations
in $\rho$ ordered by times of occurrence, breaking ties by placing $\op_k$
before $\op_l$ and using the same method as in message handling.  By construction,
$\rho | p = \tau|p$ for all $p$, and $\tau$ preserves the relative ordering
of nonoverlapping operations.

We must now show that $\tau$ is legal.  We have
$\tau = \op_{1i_1} \circ \op_{2i_2} \circ \cdots$, where $ki_k$ means that 
the $k$th operation in $\tau$ is an $\op_{i_k}$ operation.
In $\op_{ki_k}$, if
$i \le |\VC(T)|$, then the operation returns based on the local state of its
process $d$ time after its invocation time, $t$.  The local
state reflects all changes made by other operations starting by time $t$.
Any operation starting after time $t$ does not appear in $\tau$ before
$\op_{ki_k}$ by construction unless it immediately commutes with $\op_{ki_k}$.
Therefore, it does 
not affect the legality of $\op_{ki_k}$.  If $i > |\VC(T)|$,
then the operation returns based on the local state of its process at its
invocation time, $t$.  The local state reflects all changes made by other
operations starting by time $t-d$.  Any operation starting after time $t-d$
does not appear in $\tau$ before $\op_{ki_k}$ by construction unless it
immediately commutes with $\op_{ki_k}$.  Therefore, it does not affect the
legality of $\op_{ki_k}$.
\end{proof}

If $T$'s commutativity graph has a small vertex cover ($|\VC(T)| < n/2$), then
the implementation described in the proof of Theorem~\ref{vct} is faster than
that in the proof of Theorem~\ref{half-all} with
respect to the total worst-case time complexity for all operations.

The following 
abstract data type satisfies the hypothesis of Theorem~\ref{vct}.
Consider a 3-component array object with operations $W1$, $R2$, $R3$, $R1W2W3$,
and $R1R2W3$.
Figure~\ref{t:vctpic} shows 
the commutativity graph for the abstract data type.
The set $\{R1W2W3, R1R2W3\}$ forms a vertex cover for the graph.
Each operation is immediately self-commuting.  No cyclic dependences exist.
This type is similar to BANKACCOUNT3.

\begin{center}
\begin{figure}[tbh]
\setlength{\unitlength}{0.00083300in}%
\begingroup\makeatletter\ifx\SetFigFont\undefined
\def\x#1#2#3#4#5#6#7\relax{\def\x{#1#2#3#4#5#6}}%
\expandafter\x\fmtname xxxxxx\relax \def\y{splain}%
\ifx\x\y   % LaTeX or SliTeX?
\gdef\SetFigFont#1#2#3{%
  \ifnum #1<17\tiny\else \ifnum #1<20\small\else
  \ifnum #1<24\normalsize\else \ifnum #1<29\large\else
  \ifnum #1<34\Large\else \ifnum #1<41\LARGE\else
     \huge\fi\fi\fi\fi\fi\fi
  \csname #3\endcsname}%
\else
\gdef\SetFigFont#1#2#3{\begingroup
  \count@#1\relax \ifnum 25<\count@\count@25\fi
  \def\x{\endgroup\@setsize\SetFigFont{#2pt}}%
  \expandafter\x
    \csname \romannumeral\the\count@ pt\expandafter\endcsname
    \csname @\romannumeral\the\count@ pt\endcsname
  \csname #3\endcsname}%
\fi
\fi\endgroup
\begin{picture}(5270,4895)(-11,-4044)
\thicklines
\put(1651,-3661){\circle{750}}
\put(3301,-1861){\circle{750}}
\put(4876,-3661){\circle{750}}
\put(4701,-136){\circle{750}}
\put(1651,-136){\circle{750}}
\put(3001,-2086){\line(-5,-6){1088.115}}
\put(3601,-2086){\line( 5,-6){1057.377}}
\put(4501,-3736){\line(-1, 0){2475}}
\put(1876,-436){\line( 1,-1){1200}}
\put(4726,-511){\line( 0,-1){2850}}
\put(4651,-511){\line(-1,-1){1125}}
\put(3026,-1936){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$r1w2w3$}}}
\put(4651,-3736){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$r1r2w3$}}}
\put(1576,-3736){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$w1$}}}
\put(1576,-211){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$r2$}}}
\put(4651,-211){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$r3$}}}
\end{picture}
\caption{Example commutativity graph for Theorem~\ref{vct}}
\label{t:vctpic}
\end{figure}
\end{center}


Theorems~\ref{half-all} and~\ref{vct} hold for abstract data types
without $n$-cyclic dependences.  However,
some abstract data types do have $n$-cyclic dependences.  Improving the
bounds for implementations of those types is much more complicated than
improving the bounds for implementations of types without $n$-cyclic
dependences.  We show a tight bound for component array data types
with three immediately self-commuting operations and a $3$-cyclic dependence.

\begin{theorem}
\label{2dimpcd}
If $T$ is a component array data type with 
operations $\OP_1$, $\OP_2$, and
$\OP_3$ having a $3$-cyclic dependence,
where $\OP_1$ is 
immediately self-commuting and has no cyclic dependences with
$\OP_2$ and $\OP_3$,
then there exists a linearizable
implementation of $T$ such that \(\sum_{i=1}^{3} |\OP_i| = 2d \).
\end{theorem}

\begin{proof}
We exhibit an implementation where $|\OP_1| = 0$, $|\OP_2| = d$, and 
$|\OP_3| =
d$.  $|\OP_1| + |\OP_2| + |\OP_3| \ge 2d$ by 
Theorem~\ref{3c2d}.  At least one of
$|\OP_1|, |\OP_2|,$ and $|\OP_3|$ must be at least $d$ by
Theorem~\ref{sc-big-cycle}.
The assumption that $\OP_1$ is 
immediately self-commuting is necessary because
of Theorem~\ref{slow-op}.
Each process keeps a copy of all objects in its local memory.  When an
$\op_1$ is invoked at process $p$, $p$ sends a message $\DoOp_1(X)$ with the
argument list for the operation to all processes (including itself) and returns
immediately.  When a process receives any form of $\DoOp_1$ message, it performs
the operation on $X$ in its local memory.  When an $\op_j$ 
($j = 2$ or $3$) on
object $X$ is invoked at process $p$, $p$ sends a message $\DoOp_j(X)$ with the
argument list for the operation to all processes (including itself), waits $d$
time, and returns the result of performing the operation.  When a process
receives any form of $\DoOp_j$ ($j = 2$ or $3$) message, it performs the
operation on $X$ in its local memory.
We can break ties in the
following way:  $\DoOp_i$ messages are handled before 
$\DoOp_j$ messages if $i <
j$, and we use process identifiers to break any remaining ties.

We now prove that this algorithm is correct.
Let $\rho$ be an admissible execution.  We systematically construct the
desired $\tau$.  Each operation in $\rho$ occurs time $d$ after the time
of its call.  Let $\tau$ be the sequence of 
operations in $\rho$ ordered by the
times of occurrence, breaking ties by placing 
$\op_k$ before $\op_l$ if
$k < l$, and using process identifiers to break any remaining ties.  By
construction, $\rho|p = \tau|p$ for all $p$, and $\tau$ preserves the relative
ordering of nonoverlapping operations.

We now must show that $\tau$ is legal, or that for each object $X$, $\tau|X$ is
in the sequential specification of $T$.
Consider an $\op_k$ that returns a value list.  It returns based on the local
state of its process at its response time in $\rho$.  The local state reflects
all changes made by
operations occurring up to the time of the $\op_k$ in $\rho$.  Thus 
the $\op_k$
returns a legal value list in $\tau$.
\end{proof}

The BANKACCOUNT3 abstract data type satisfies the hypothesis of
Theorem~\ref{2dimpcd}.

\subsubsection{Relative costs}
Let us summarize this section's findings. We know
several algebraic properties of operations causing individual
operations
or groups of operations satisfying one of those properties to have a worst-case
response time of $\Omega(d)$.

Let $L_1$ be the list of properties.
We have linearizable implementations of classes of abstract data types
in which the worst-case response time for one operation or a group of
operations is optimized (the worst-case response time is $0$), provided that
the operation or group satisfies one of three algebraic properties.
Let $L_2$ be the list of three properties.
If not satisfying any property in $L_2$ implies satisfying a property in
$L_1$, then we could say that
sequential consistency and linearizability are equally costly.  Unfortunately,
$L_1$ and $L_2$ do not quite satisfy that desired relationship.
However,
if the operation or group of operations to be optimized is known to be
invoked most frequently, then sequential consistency and
linearizability are equally costly for those abstract data types.

In addition, now we have linearizable implementations of
certain classes of abstract data types with the total worst-case completion
time for all operations matching the sequential consistency lower bound for
all operations.  This implies that
sequential consistency and linearizability are equally costly for these
classes.

\section{Imperfect clocks}
\label{approx}
In this section we assume that clocks run at the same rate as real time but
are not initially synchronized, and that message delays are in the range
$[d-u,d]$ for some $u > 0$.
We prove that many operations in linearizable implementations of virtual
shared objects must have worst-case response times that are $\Omega(u)$,
under reasonable assumptions about the amount of sharing of objects by
processes.  We show that for many abstract data types, we can provide
operations with worst-case time complexities of $0$ in sequentially
consistent implementations.  This shows that linearizability can be more
expensive than sequential consistency under our assumptions about process
synchrony.  Subsection~\ref{lin-lb} 
contains our lower bounds on the costs of
operations under linearizability, and
Subsection~\ref{sc-ub} contains 
our sequentially consistent implementations of
classes of abstract data types.

To prove our lower bounds, we use shifting techniques
introduced in~\cite{cj99-09-16} 
to prove lower bounds on the precision achieved by 
clock synchronization algorithms.  These techniques are used
in~\cite{cj99-09-05} to prove $\Omega(u)$ worst-case 
response times for read, write,
and enqueue operations.  Shifting changes the
timing and ordering of system events without changing the local
view of each process.

In an execution with a certain set of clocks, if process $p$'s history is
changed so that the real times at which the events occur are shifted by some
amount $s$ and if $p$'s clock is also shifted by amount $s$, then the result
is another execution in 
which every process ``sees'' the same events happening
at the same real times.  Because its clock has changed by the same amount,
$p$ cannot detect the changes in the real times at
which events occur.

The {\em view\/} of process $p$ in history $\pi$ of $p$ with clock $C$ is the
concatenation of the sequences of steps in $\pi$, arranged in real-time
order.  The view does not contain the real times 
when the steps occurred.  History
$h$ of process $p$ with clock $C$ and history $h'$ of process $p$ with
clock $C'$ are {\em equivalent\/} 
if $p$'s view is the same in both histories.
Execution $\rho$ of the system $(P,\cal C)$ and execution $\rho'$ of the system
$(P,\cal C')$ are equivalent if the component histories for $p$ in $\rho$ and
$\rho'$ are equivalent for all $p$ in $P$.  This means that the processes
cannot tell the difference between the two executions.

Given a history $\pi$ of a process $p$ with clock $C$ and real number $s$, a
new history $\pi' = \shift(\pi,s)$ is defined by $\pi'(t) = \pi(t+s)$ for
all $t$.  This means that all tuples are shifted earlier in $\pi'$ by $s$ if
$s > 0$ and later in $\pi'$ by $-s$ if $s < 0$.  Given a clock $C$ and a real
number $s$, a new clock $C' = 
\shift(C,s)$ is defined by $C'(t) = C(t) + s$ for
all $t$.  This means that the clock is shifted forward by $s$ if $s > 0$ and
backward by $s$ if $s < 0$.
Shifting a history of process $p$ and $p$'s clock by the same amount produces
another history.

\begin{lemma}[\cite{cj99-09-16}]
\label{new-hist}
Let $\pi$ be a history of a 
process $p$ with clock $C$, and let $s$ be a real
number.  Then $\shift(\pi,s)$ is a history of $p$ with clock $\shift(C,s)$.
\end{lemma}

Given an execution $\rho$ of a system $(P,\cal C)$ and a real number $s$, we
define a new execution $\rho' = \shift(\rho,p,s)$ by replacing $\pi$ ($p$'s
history in $\rho$) with $\shift(\pi,s)$ 
and by retaining the same correspondence
between sends and receives of messages.  (We redefine the correspondence so
that a pairing in $\rho$ that involves $p$'s event at time $t$ involves
$p$'s event at time $t-s$ in $\rho'$.)  All tuples for a 
process $p$ are shifted
by $s$, but no other tuples are changed.  

Given a set of clocks 
${\cal C} = \{C_q\}_{q \in P}$ and a 
real number $s$, we define a new set of
clocks ${\cal C'} = \shift({\cal C},p,s)$ by replacing $C_p$ with 
$\shift(C_p,s)$.
Process $p$'s clock is shifted forward by $s$, but no other clocks are changed.
Shifting one process's history and its clock by the same amount in an execution
results in an execution that is equivalent to the original.

\begin{lemma}[\cite{cj99-09-16}]
\label{shift-equiv}
Let $\rho$ be an execution of system $(P,\cal C)$, 
let $p$ be a process, and let $s$
be a real number.  Let ${\cal C'} = \shift({\cal C},p,s)$ and 
$\rho' = \shift(\rho,p,s)$.  
Then $\rho'$ is an execution of $(P,{\cal C'})$, and
$\rho'$ is equivalent to $\rho$.
\end{lemma}

The following lemma tells us the amount of change encountered in 
message delays when an execution
is shifted.  Shifting an admissible execution may produce a nonadmissible
execution.

\begin{lemma}[\cite{cj99-09-16}]
\label{shift}
Let $\rho$ be an execution of the system $(P,\cal C)$, let 
$p$ be a process, and let $s$
be a real number.  Let ${\cal C'} = \shift({\cal C},p,s)$ and 
$\rho' = \shift(\rho,p,s)$.  Suppose $x$ is the delay of message $m$ from
process $q$ to process $r$ in $\rho$.  Then the delay of $m$ in $\rho'$ is
$x$ if $q \neq p$ and $r \neq p$, $x-s$ if $r=p$, and $x+s$ if $q=p$.
\end{lemma}

\subsection{Lower bounds for linearizability}
\label{lin-lb}
We now give two algebraic properties of operations that cause individual
operations to have a worst-case time complexity of 
$\Omega(u)$ in linearizable
implementations of objects of their abstract data types, under reasonable
assumptions about the amount of sharing of objects by processes.

This first result is that an operation must have worst-case
time complexity 
of $\Omega(u)$ if the legality of some sequence depends on the
order in which two instances of the operation are executed.
\begin{theorem}
\label{imp-no-commute}
Let $T$ be an abstract data type with an operation $\OP$
such that
$\rho \circ \op^1 \circ \op^2$ does not look like
$\rho \circ \op^2 \circ \op^1$ for some operation sequence $\rho$ and some
operation instances $\op^1$ and $\op^2$.
For any linearizable implementation of an object of type $T$ in a system
with at least
three different processes, $|\OP| \ge u/2$.
\end{theorem}

\begin{proof}
We generalize the proofs in~\cite{cj99-09-05}
that a write for a read/write object and an enqueue for
a queue must take time at least $u/2$.
Since $\rho \circ \op^1 \circ \op^2$ does not look like
$\rho \circ \op^2 \circ \op^1$, there exists an operation sequence $\gamma$
such that $\rho \circ \op^1 \circ \op^2 \circ \gamma$ is legal but
$\rho \circ \op^2 \circ \op^1 \circ \gamma$ is not legal.

Let $A$ be an object of $T$. 
Let $p_1$ and $p_2$ be
two processes that modify $A$, and let $p_3$ be a process that performs
operations on $A$.
Assume in contradiction that there is a linearizable implementation of $A$ for
which $|\OP| < u/2$.
By the specification of $A$ and Lemma~\ref{axone}, there is an
admissible execution $\alpha$ such that the following hold:

\begin{itemize}

\item $\ops(\alpha)$ is $\rho[A,p_3] \circ \op^1[A,p_1] \cdot
\op^2[A,p_2] \circ \gamma[A,p_3]$.

\item $\rho[A,p_3]$ starts at time $0$ and ends at time $t$, $\op^1[A,p_1]$
starts at time $t$, $\op^2[A,p_2]$
starts at time $t+u/2$, and $\gamma[A,p_3]$ starts at time $t+u$.

\item The message delays in $\alpha$ are $d$ from $p_1$ to $p_2$, $d-u$ from
$p_2$ to $p_1$, and $d-u/2$ for all other ordered pairs of processes.

\end{itemize}

Let $\beta = \shift(\shift(\alpha,p_1,-u/2),p_2,u/2)$ 
(shift $p_1$ later by $u/2$
and $p_2$ earlier by $u/2$).  We have admissible $\beta$ because, 
by Lemma~\ref{shift},
the delay of a message from $p_1$ or to $p_2$ is $d-u$, 
the delay of a message
from $p_2$ or to $p_1$ is $d$, and all other delays are unchanged.  But the
linearization of $\ops(\beta)$ is 
$\rho[A,p_3] \circ \op^2[A,p_2] \circ \op^1[A,p_1] \circ \gamma[A,p_3]$,
which is not legal.
\end{proof}

\begin{center}
\begin{figure}%[tbh]
\setlength{\unitlength}{0.00063300in}%
\begingroup\makeatletter\ifx\SetFigFont\undefined
\def\x#1#2#3#4#5#6#7\relax{\def\x{#1#2#3#4#5#6}}%
\expandafter\x\fmtname xxxxxx\relax \def\y{splain}%
\ifx\x\y   % LaTeX or SliTeX?
\gdef\SetFigFont#1#2#3{%
  \ifnum #1<17\tiny\else \ifnum #1<20\small\else
  \ifnum #1<24\normalsize\else \ifnum #1<29\large\else
  \ifnum #1<34\Large\else \ifnum #1<41\LARGE\else
     \huge\fi\fi\fi\fi\fi\fi
  \csname #3\endcsname}%
\else
\gdef\SetFigFont#1#2#3{\begingroup
  \count@#1\relax \ifnum 25<\count@\count@25\fi
  \def\x{\endgroup\@setsize\SetFigFont{#2pt}}%
  \expandafter\x
    \csname \romannumeral\the\count@ pt\expandafter\endcsname
    \csname @\romannumeral\the\count@ pt\endcsname
  \csname #3\endcsname}%
\fi
\fi\endgroup
\begin{picture}(7212,7785)(301,-7063)
\thicklines
\put(1501,-3886){\line( 1, 0){2100}}
\put(1501,-3736){\line( 0,-1){300}}
\put(3601,-3736){\line( 0,-1){300}}
\put(6001,-3886){\line( 1, 0){1500}}
\put(1501,-5986){\vector( 1, 0){6000}}
\put(1501,-5836){\line( 0,-1){300}}
\put(3601,-5836){\line( 0,-1){300}}
\put(4801,-5836){\line( 0,-1){300}}
\put(6001,-5836){\line( 0,-1){300}}
\put(6001,-3736){\line( 0,-1){300}}
\put(1501,239){\line( 1, 0){2100}}
\put(1501,389){\line( 0,-1){300}}
\put(4801,-361){\line( 0,-1){300}}
\put(4576,-1111){\line( 0,-1){300}}
\put(6001,239){\line( 1, 0){1500}}
\put(4801,-511){\line( 1, 0){975}}
\put(3601,-1111){\line( 0,-1){300}}
\put(3601,-1261){\line( 1, 0){975}}
\put(1501,-1861){\vector( 1, 0){6000}}
\put(1501,-1711){\line( 0,-1){300}}
\put(3601,-1711){\line( 0,-1){300}}
\put(4801,-1711){\line( 0,-1){300}}
\put(6001,-1711){\line( 0,-1){300}}
\put(6001,389){\line( 0,-1){300}}
\put(5776,-361){\line( 0,-1){300}}
\put(5776,-5236){\line( 0,-1){300}}
\put(4801,-5236){\line( 0,-1){300}}
\put(4801,-5386){\line( 1, 0){975}}
\put(3601,-4486){\line( 0,-1){300}}
\put(3601,-4636){\line( 1, 0){975}}
\put(4576,-4486){\line( 0,-1){300}}
\put(3601,389){\line( 0,-1){300}}
\put(7501,-3736){\line( 0,-1){300}}
\put(7501,389){\line( 0,-1){300}}
\put(301,-3961){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$p_3$}}}
\put(301,-4711){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$p_2$}}}
\put(301,-5461){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$p_1$}}}
\put(2401,-3811){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\rho$}}}
\put(301,-3511){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Process}}}
\put(301,-6061){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Time}}}
\put(1501,-6361){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$0$}}}
\put(3601,-6361){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$t$}}}
\put(4651,-6361){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$t+u/2$}}}
\put(5851,-6361){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$t+u$}}}
\put(4201,-7036){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Execution $\beta$}}}
\put(301,164){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$p_3$}}}
\put(301,-586){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$p_2$}}}
\put(301,-1336){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$p_1$}}}
\put(2401,314){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\rho$}}}
\put(301,614){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Process}}}
\put(301,-1936){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Time}}}
\put(5101,-436){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$op^2$}}}
\put(3901,-1186){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$op^1$}}}
\put(1501,-2236){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$0$}}}
\put(3601,-2236){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$t$}}}
\put(4651,-2236){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$t+u/2$}}}
\put(5851,-2236){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$t+u$}}}
\put(4201,-2911){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}Execution $\alpha$}}}
\put(5101,-5311){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$op^1$}}}
\put(3901,-4561){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$op^2$}}}
\put(6451,314){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\gamma$}}}
\put(6451,-3811){\makebox(0,0)[lb]{\smash{\SetFigFont{10}{14.4}{rm}$\gamma$}}}
\end{picture}
\caption{Counterexample for 
Theorem~\ref{imp-no-commute}\label{newfig}}
\end{figure}
\end{center}

A variant of Theorem~\ref{imp-no-commute} can be obtained if the operation
instances in the statement of the theorem 
are not in the
same generic operation.  
See Figure~\ref{newfig}.
The conclusion is that at least one of the generic
operations must take time at least $u/2$.  We omit the proof because of its
similarity to the proof of Theorem~\ref{imp-no-commute}.

\begin{theorem}
Let $T$ be an abstract data type with operations $\OP_1$ and $\OP_2$
such that
$\rho \circ \op_1 \circ \op_2$ does not look like
$\rho \circ \op_2 \circ \op_1$ for some operation sequence $\rho$ and some
operation insta