% lmmsys.dem
% LaTeX package LJour2 1.0: demonstration file
% (c) Springer-Verlag HD
%----------------------------------------------------------------------
% This is an article of Multimedia Systems
%
\documentstyle[fleqn]{cljour2}
\mathindent=0pt
\begin{document}
\def\While{\hbox{\bf while }}
\def\Begin{\hbox{\bf begin }}
\def\End{\hbox{\bf end;}}
\def\Endwhile{\hbox{\bf endwhile;}}
\def\Endfor{\hbox{\bf endfor;}}
\def\Endforall{\hbox{\bf endforall;}}
\def\Endif{\hbox{\bf endif;}}
\def\If{\hbox{\bf if }}
\def\Then{\hbox{\bf then }}
\def\Else{\hbox{\bf else }}
\def\To{\hbox{\bf to }}
\def\Do{\hbox{\bf do }}
\def\Dopar{\hbox{\bf do\_in\_parallel}}
\def\Step{\hbox{\bf step }}
\def\rowm{\hbox{\bf Row\_mode;}}
\def\colm{\hbox{\bf Column\_mode;}}
\def\For{\bf for }
\def\Forall{\bf forall}
\def\la{\langle}
\def\ra{\rangle}
\journalname{Multimedia Systems}
\title{Intermediate-level vision tasks on a memory array architecture}
\author{Poras T. Balsara\inst{1}\and Mary Jane Irwin\inst{2}}
%
\mail{Poras T. Balsara}
%
\institute{Erik Jonsson School of Engineering \& Computer Science, University of Texas at Dallas,\\ P.O. Box 830688 MP33, Richardson, TX 75083, USA \and Department of Computer Science, Pennsylvania State University, University Park, PA 16802, USA}
%
\date{Received January 30, 1992}
\maketitle
%
\begin{abstract}
With the fast advances in computer vision and robotics there is a growing need for machines that can ``understand images'' at very high speed. A conventional von Neumann computer is not suitable for this purpose because it takes a tremendous amount of time to solve most typical image analysis problems. Thus, it is now imperative to study computer vision in a parallel processing framework in order to reduce the processing time. In this paper we demonstrate the applicability of a simple memory array architecture to some intermediate-level computer vision tasks.
This architecture, called the {\it Access Constrained Memory Array Architecture} (ACMAA), has a linear array of processors which concurrently access distinct rows or columns of an array of memory modules. Because of its efficient local and global communication capabilities, ACMAA is well suited to low-level as well as intermediate-level vision tasks. This paper presents algorithms for connected component labeling, determination of area, perimeter and moments of a labeled region, convex hull of a region, and Hough transform of an image. ACMAA is well suited to an efficient hardware implementation because it has a modular structure, simple interconnect and limited global control.
\end{abstract}
%
\keywords{Memory array architecture -- Computer vision tasks -- Algorithms}
\strich
\def\lc{\hbox{$\lceil$}}
\def\rc{\hbox{$\rceil$}}
\def\lf{\hbox{$\lfloor$}}
\def\rf{\hbox{$\rfloor$}}
\section{Introduction}
Over the last decade, the field of computer vision has evolved very rapidly, mainly because of growing needs in industrial automation, robotics and various other fields. Such vision systems need to analyze and understand images at very high speed. In reality, even very modest image analysis tasks often require some hundreds of local neighborhood operations, thus requiring very high computation rates. This has motivated researchers to explore architectures other than the conventional von Neumann machine, which requires an excessive amount of time for the sequential processing of the input image data. Exploiting the inherent parallelism present in various vision tasks can significantly reduce the processing time. Fortunately, with the advent of VLSI technology, hardware has become cheaper and parallelism has become increasingly affordable. This has led to several proposals and designs for parallel architectures for image processing and computer vision (Fountain et al. 1988; Ibrahim 1984; Hwang and Fu 1983; Danielson and Levialdi 1981).
Construction of a parallel image processing architecture is now viable because it is possible to embed a number of processors and memory elements within a single chip in a cost-effective manner. This paper presents the results of a study done to demonstrate the usefulness of a memory array architecture with constrained access for intermediate-level computer vision tasks. This architecture was recently proposed by Scherson and Ma (1987, 1989) and Hwang et al. (1989), who also described its use for problem solving in numerical analysis, signal processing and sorting. The architecture is based on the concept of an orthogonally accessed memory, which was earlier proposed and used by Buehrer et al. in EMPRESS, a dynamically configurable multiprocessor system (Buehrer et al. 1982). In this system a memory array was used as an intercommunication system to pass information among processors. An architecture for a computer vision system should have processing capabilities for a wide range of tasks -- it should be able to perform simple, neighborhood operations on the one hand and complex, global operations on the other. The memory array architecture, because of its efficient local and global communication capabilities, is well suited to a wide range of vision tasks. In this paper we describe ACMAA algorithms for histogramming, connected component labeling, area, perimeter and moments of a region, convex hull of a region and Hough transform for extracting straight lines. A detailed description of several low-level vision algorithms on ACMAA is given elsewhere (Balsara and Irwin 1991). In order to spare the reader implementation details we shall assume that all algorithms in the following sections are designed for a memory array with one pixel per memory module. However, as described in Sect.\ts 10, they can be easily modified for the case when there is more than one pixel per memory module.
The algorithms described in this paper have been tested using a functional simulator for ACMAA (Kennedy 1989). This simulator was not detailed enough to provide timing and memory access information; however, it enabled us to verify the correctness of our algorithms. In the following section we describe the organization of ACMAA and some elementary operations. Sects.\ts 3 through 9 describe the details of a few intermediate-level vision algorithms. In Sect.\ts 10 we discuss some features of ACMAA and these algorithms. Finally, in Sect.\ts 11 we provide concluding remarks.
\section{Access constrained memory array architecture}
The organization of ACMAA of size $n$ is depicted in Fig.\ts 1. It consists of a linear array of $n$ processors (${P_i}, 0 \leq i \leq n-1$) and an $n \times n$ array of memory modules (${M_{ij}}, 0 \leq i,j \leq n-1$). In order to make ACMAA applicable to a wide range of vision problems, each processor should have more computational power than the simple, special-purpose processors designed for specific massively parallel computations (e.g. large $n \times n$ meshes, which are ideal for nearest neighbor computations). Typically, each ACMAA processor has a number of registers used to store local variables and is capable of performing the usual Boolean operations (e.g., not, and, or, xor, logical-shift) and arithmetic operations (e.g., add, subtract, complement, arithmetic-shift, hardware multiply). Each memory module has a constant number of words. All the memory modules in a row $i$ are connected to the row bus $RB_i$, and the modules in a column $i$ are connected to the column bus $CB_i$. Each set of buses is dedicated to one processor; hence there is no time sharing of the buses by the processors. Each processor can access a row or a column of memory. Bus controller logic takes care of switching the processors between row and column buses.
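For concreteness, the organization just described can be captured in a small executable model. The following Python sketch is purely our own illustration (the class and method names, e.g. {\tt row\_write}, are hypothetical and not part of the original proposal); it models the $n \times n$ array of memory modules and the constraint that processor $P_i$ reaches module $M_{ij}$ over its row bus and module $M_{ji}$ over its column bus.

```python
# Minimal model of the ACMAA memory organization (illustrative only).
class ACMAA:
    def __init__(self, n):
        self.n = n
        # M[i][j] models memory module M_ij; each module maps
        # variable names to stored values.
        self.M = [[{} for _ in range(n)] for _ in range(n)]

    # Row access mode: processor i may touch only modules M[i][j].
    def row_write(self, i, j, var, value):
        self.M[i][j][var] = value

    def row_read(self, i, j, var):
        return self.M[i][j][var]

    # Column access mode: processor i may touch only modules M[j][i].
    def col_write(self, i, j, var, value):
        self.M[j][i][var] = value

    def col_read(self, i, j, var):
        return self.M[j][i][var]

# Two-step message passing through the shared module M_ij:
# P_i writes in row mode, then P_j reads the same module in column mode.
net = ACMAA(4)
net.row_write(1, 3, "msg", "hello")          # step 1: P_1, row mode
assert net.col_read(3, 1, "msg") == "hello"  # step 2: P_3, column mode
```

The two access modes are mutually exclusive in the model just as in the architecture; a fuller simulator would additionally enforce that all processors operate in the same mode at any step.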
\begin{figure} \vspace{8cm} \caption[]{Organization of the ACMAA} \end{figure} \subsection{Memory access modes} As mentioned before, a processor $P_i$ can access either memory row $i$ or memory column $i$ at any given time. These access modes are selected by means of a switch $SW_i$ attached to a processor $P_i$. \begin{description}[--] \item[--]{\it Row access mode.} Each processor $P_i$ can access any memory module $j$ in the $i^{th}$ row memory only, i.e., $P_i$ can access $M_{ij}$ for $0 \leq j \leq (n-1)$. \item[--] {\it Column access mode.} Each processor $P_i$ can access any memory module $j$ in the $i^{th}$ column memory only, i.e., $P_i$ can access $M_{ji}$ for $0 \leq j \leq (n-1)$. \item[--] {\it Broadcast mode.} Each processor $P_i$ can write a single data value at the same location of all the memory modules in its row or column, depending upon the state of the bus controller. This mode is useful for global initialization operations on the memory array. \end{description} \medskip The first two access modes are mutually exclusive. In our constrained model all processors operate in the same access mode and they run the same set of instructions in a lock-step fashion. Hence, it can be classified as a {\it Single Instruction Multiple Data} (SIMD) architecture. These access constraints remove memory access conflicts and simplify the memory control. However, this architecture could also be classified as {\it Multiple Instruction Multiple Data} (MIMD) with partially shared memory as mentioned by Hwang et al. (1989) if the processors are allowed to operate independently. We use this architecture in the SIMD mode, enabling us to design algorithms with simple conflict-free memory control. In the existing literature this architecture is also referred to as {\it Orthogonal Multiprocessor} (Hwang et al. 1989), or {\it Orthogonal Memory-Access Multiprocessor} (Scherson and Ma 1987; Scherson and Ma 1989). 
However, we call it {\it Access Constrained Memory Array Architecture} (ACMAA) because this name brings forth its main feature: simple control through the constraints on the orthogonal access of the memory array. Using the above-mentioned memory access modes a processor can communicate with another processor in two steps (constant time). This is done as follows: a processor $P_i$ communicates with another processor $P_j$ by first putting a message in the memory module $M_{ij}$ while operating in the {\it row mode}. Then processor $P_j$ can read this message from $M_{ij}$ by operating in the {\it column mode}. In a similar way every processor can send a message to every other processor simultaneously, or one processor can broadcast a message to all other processors using the {\it broadcast mode}, in two time steps or $O(1)$ time. A detailed analysis of the communication overhead introduced by the ACMAA network is presented by Scherson and Ma (1989).
\subsection{Suitability of ACMAA for computer vision}
ACMAA has many interesting features which make it highly suited to a wide range of tasks in computer vision. These features are mentioned below.
\begin{description}[--]
\item[--] {\it Simple structure.} The array structure and simple interconnection scheme of ACMAA are favorable characteristics for efficient hardware implementation. However, because of its simple bus-oriented interconnect structure it may not be practical to construct an ACMAA with a large number of processors, owing to the performance degradation caused by long buses. In that case, each memory module will have more than one pixel and each processor will be responsible for a band of pixels rather than a one-pixel strip, as described in Sect.\ts 10.
\item[--] {\it Simple control.} Constrained memory access leads to simpler control than in other shared-memory architectures.
Bus control logic associated with each processor enables it to access its set of row or column modules by selecting the corresponding memory modules using a memory chip select line.
\item[--] {\it Economical.} Most of the image processing architectures proposed so far have a one-processor-per-pixel design; hence, they are quite expensive to construct for gray-level images of any reasonable size. For an image of size $N \times N$ we need at most $N$ processors and an $N \times N$ memory array to construct the ACMAA. The $N \times N$ memory array is much cheaper and smaller to construct than a processor array of the same size. Since in any access mode only one bus accesses a memory module, it should be possible to use standard single-port memory circuits for the memory array.
\item[--] {\it Mapping of images.} An $N \times N$ digitized image maps directly onto the memory array; there is no restructuring or reformatting of the input data. In this respect the ACMAA has the advantage of the mesh organization over tree or pipelined architectures. Furthermore, this architecture can easily be expanded to have a stack of memory arrays instead of just one as described above, in order to store multiple images or frames in such a way that each array stores one image. This is very useful in dynamic scene analysis, and in motion analysis and detection.
\item[--] {\it Efficient communications.} ACMAA provides very efficient local and global communications. Efficient local communication is useful in low-level vision tasks. Global communication through inter-processor message transfer or by a broadcast makes it well suited to intermediate-level and high-level vision tasks.
\end{description}
\subsection{Terminology}
For the algorithms described in this paper we use the notation described below:
\begin{description}[Column\_mode\hfill]
\item[$M_{ij}$\hfill] memory module in row $i$ and column $j$.
\item[$(A)^{[i,j]}$\hfill] variable $A$ stored in memory module $M_{ij}$.
\item[$(A)^{[i,*]}$\hfill] all variables $A$ stored in $i^{th}$ row memory modules.
\item[$(A)^{[*,j]}$\hfill] all variables $A$ stored in $j^{th}$ column memory modules.
\item[$(A)^{[i,j]}\la k\ra$\hfill] variable $A$ stored in $M_{ij}$ at location $k$.
\item[$(A)^{[i,j]}\la k,l\ra$\hfill] variable $A$ stored in $M_{ij}$ at location $(k,l)$.
\item[$A$\hfill] variable $A$ used as local variable by a\\ processor.
\item[Column\_mode\hfill] switch to the column access mode.
\item[Row\_mode\hfill] switch to the row access mode.
\end{description}
\section{Histogramming}
In this section we provide a brief description of the ACMAA histogram algorithm for a monochrome image. A detailed description is given elsewhere (Balsara and Irwin 1991); the algorithm is summarized here for completeness because it is useful for some intermediate-level vision tasks. \medskip\noindent
\begin{figure*} \hbox to \hsize{\hrulefill} \medskip\noindent {\bf Algorithm 1:} Image histogram \medskip\noindent \Forall\ {\it processors} $P_i$, $(0 \leq i \leq N-1)$ \Dopar\newline \Begin\newline \phantom{\Begin}\rowm\newline \phantom{\Begin}$(H)^{[i,*]} = 0$;\hskip 0.5in {\rm (* global initialization in each row *)}\newline \phantom{\Begin}\For $j=0$ \To $(N-1)$ \Do \hskip 0.2in {\rm (* determine row histograms *)}\newline \phantom{\Begin\For}$g = (X)^{[i,j]}$;\newline \phantom{\Begin\For}$(H)^{[i,g]} = (H)^{[i,g]} + 1$;\newline \phantom{\Begin}\Endfor \smallskip\noindent \phantom{\Begin}\colm \hskip 0.5in {\rm (* sum up the columns *)} \newline \phantom{\Begin}\If $(i \leq G-1)$ \Then \newline \phantom{\Begin\If}\For $j = 0$ \To $N-2$ \Do \newline \phantom{\Begin\If\For} $(H)^{[N-1,i]} = (H)^{[N-1,i]} + (H)^{[j,i]}$; \newline \phantom{\Begin\If}\Endfor \newline \phantom{\Begin\If}$(H)^{[i,i]} = (H)^{[N-1,i]}$; \newline \phantom{\Begin}\Endif \newline \Endforall\medskip\noindent \hrule \end{figure*} Let the input image $\vec X$ be of size $N \times N$ stored one pixel per memory module, and let the gray levels be quantized from $0$ to
$(G-1)$. For this algorithm, we assume $G \leq N$, as is the case with most images used in computer vision. The value of the histogram for a gray level $g$ is denoted by the contents of location (bucket) $H_g$ of the vector $\vec H$ stored one bucket per module. In the first part of our algorithm, each processor $P_i$ computes the image histogram of row $i$ and stores it in the vector $\vec H$ for that row. Then, in the second part all processors $P_i$ with $i \leq G-1$ sum up the contents of all the histogram buckets $H_i$ stored in their column memory and store the final result in the memory module $M_{ii}$. Details of this process are described in Algorithm 1. This algorithm has a time complexity of $O(N)$ because each of its loops runs for $N$ iterations concurrently on all the $N$ processors. A sequential algorithm determines the histogram in $O(N^2)$ time; hence, the ACMAA algorithm is optimal for an $N$-processor system. \section{Connected component labeling} In most computer vision problems it is desirable to break up an image into regions, each of which might correspond to different surfaces of objects or areas in a scene, etc. These regions are then labeled so that they can be identified and processed individually by some other algorithms, for example, obtaining geometric properties of regions or obtaining a connectivity graph of an image. Region labeling or connected component labeling is one of the fundamental tasks in image analysis that identifies such regions of an image. The labeling algorithm processes a two-dimensional image and assigns unique labels to different regions in the image. These regions or connected components consist of sets of pixels that are connected to each other and have the same gray value. The output of this algorithm can be used by many other algorithms to further analyze the image (Ballard and Brown 1982).
Several sequential and parallel algorithms and architectures have been proposed to label regions of an input image (Lumia et al. 1983; Nassimi and Sahni 1980; Weems 1984; Ibrahim 1984; Duff and Fountain 1986; Cypher et al. 1987). Sequential algorithms based on generating a table of equivalence classes and using local neighborhoods are given by Lumia et al. (1983) and Rosenfeld and Kak (1982). Although these algorithms have a time complexity of $O(N^2)$, they involve at least two scans of an image and require additional data structures and processing time to maintain and sort a list of equivalent labels into equivalence classes. For an $N \times N$ image, Nassimi and Sahni (1980) give an $O(N)$ algorithm on a size $N^2$ two-dimensional mesh. This algorithm achieves good asymptotic complexity; however, it uses complex data manipulation and global communication routines. Cypher et al. (1987) give an $O(N)$ mesh algorithm based on Levialdi's shrinking algorithm. To label a binary image their algorithm needs $O(\log N)$ bits of memory per pixel besides the storage required for the pixel value and its label. A straightforward implementation requires $O(N)$ additional bits of memory per pixel. Weems (1984) gives an algorithm based on local neighborhood processing on a two-dimensional mesh of content-addressable processors. This algorithm takes time proportional to the diameter of a region, and for long thin regions (e.g. serpentine figures or spirals) this diameter is of the order of the area of the image [$O(N^2)$]. The connected component labeling algorithm described in this section takes an input image array $\vec X$ of size $N \times N$ and produces an output array $\vec{label}$ such that $label_{ij}$ corresponds to the label of pixel $x_{ij}$. Connectivity of image pixels can be described by either of these two definitions: 4-connectivity and 8-connectivity.
Two pixels are 4-connected if they are adjacent vertically or horizontally (i.e., pixels at locations $(i,j)$ and $(i,j\pm 1)$ or $(i,j)$ and $(i\pm 1,j)$). They are said to be 8-connected if they are adjacent horizontally, vertically or diagonally (i.e., pixels at locations $(i,j)$ and $(i\pm 1,j\pm 1)$ ). In this paper we shall use the definition of 4-connectivity to describe our algorithm. However, the algorithm mentioned below can be easily modified to label 8-connected pixels also. The algorithm described in this section is based on the bottom-up, divide-and-conquer technique. We partition the input image into a number of subimages and label each subimage. Then, we merge these labeled subimages together to determine the labels of the whole image. This merging is done in a bottom-up fashion, starting from small subimages and going up to the largest subimage, i.e. the input image. In each merge operation we merge four subimages of size $m \times m$ into one subimage of size $2m \times 2m$. Merging is done in two steps: (1) merge two subimages of size $m \times m$ horizontally across their common vertical boundary to get a subimage of size $m \times 2m$; (2) merge these subimages of size $m \times 2m$ vertically across their common horizontal boundary to get the final subimage of size $2m \times 2m$. \begin{figure*} \medskip\noindent {\bf Algorithm 2:} Region labeling \medskip\noindent \Forall\ {\it processors} $P_i$, $(0 \leq i \leq N-1)$ \Dopar \newline \Begin\newline \phantom{\Begin}\rowm \newline \phantom{\Begin}\For $j = 0$ \To $N-1$ \Do \hskip 0.5in {\rm (* initialize pixel labels *)}\newline \phantom{\Begin\For}$(label)^{[i,j]} = i*N + j$; {\rm(* $label =$ row major index *)} \smallskip\noindent \phantom{\Begin}\For $j=0$ \To $\log N-1$ \Do \newline \phantom{\Begin\For}$vbounds = {N\over 2^{j+1}}$; {\rm(* no. 
of boundaries *)}\newline \phantom{\Begin\For}$hbounds = {N\over 2^{j+1}}$; \newline \phantom{\Begin\For}$vlength = 2^j$; {\rm(* boundary length *)} \newline \phantom{\Begin \For}$hlength = 2^{j+1}$; \smallskip\noindent \phantom{\Begin\For}{\rm(* Merge horizontally across all vertical boundaries *)} \newline \phantom{\Begin\For}\rowm \newline \phantom{\Begin\For}$strip = \lf {i\over vlength} \rf$; \newline \phantom{\Begin\For}$row = i \; {\rm mod}\; vlength$; \newline \phantom{\Begin\For}$vloc = vlength$; {\rm(* location of a vert. boundary *)}\newline \phantom{\Begin\For}\For $k = 0$ \To $vbounds-1$ \Do \newline \phantom{\Begin\For\If}\For $p = 0$ \To $vlength-1$ \Do \newline \phantom{\Begin\For\If\If}\If $(row = p)$ \Then \newline \phantom{\Begin\For\If\If\If}\If $((X)^{[i,vloc-1]} = (X)^{[i,vloc]}) \wedge ((label)^{[i,vloc-1]} \neq (label)^{[i,vloc]})$ \Then \newline \phantom{\Begin\For\If\If\If\If}{\rm(* broadcast labels to both regions *)} \newline \phantom{\Begin\For\If\If\If\If}$minl = \min \{ (label)^{[i,vloc-1]}, (label)^{[i,vloc]} \}$;\newline \phantom{\Begin\For\If\If\If\If}$maxl = \max \{ (label)^{[i,vloc-1]}, (label)^{[i,vloc]} \}$;\newline \phantom{\Begin\For\If\If\If\If}$(min\_label)^{[i,*]} = minl$; \newline \phantom{\Begin\For\If\If\If\If}$(max\_label)^{[i,*]} = maxl$; \newline \phantom{\Begin\For\If\If\If}\Endif \newline \phantom{\Begin\For\If\If}\Endif \newline \phantom{\Begin\For\If\If}\colm \quad {\rm(* pick up new labels from appropriate rows *)}\newline \phantom{\Begin\For\If\If}$b = strip*vlength + p$; \newline \phantom{\Begin\For\If\If}$minl = (min\_label)^{[b,i]}$; \newline \phantom{\Begin\For\If\If}$maxl = (max\_label)^{[b,i]}$; \newline \phantom{\Begin\For\If\If}\rowm \hskip 0.5in {\rm(* update labels in both regions *)} \newline \phantom{\Begin\For\If\If}\For ($b = vloc-vlength$) \To ($vloc+vlength-1$) \Do \newline \phantom{\Begin\For\If\If\If}\If $((label)^{[i,b]} = maxl)$ \Then \newline \phantom{\Begin\For\If\If\If\If}$(label)^{[i,b]} =
minl$; \newline \phantom{\Begin\For\If}\Endfor \newline \phantom{\Begin\For\If}$vloc = vloc + 2*vlength$; \newline \phantom{\Begin\For}\Endfor \newline \newline \phantom{\Begin\For}{\rm(* Merge vertically across all horizontal boundaries *)} \newline \phantom{\Begin\For}\colm \newline \phantom{\Begin\For}$strip = \lf {i\over hlength} \rf$; \newline \phantom{\Begin\For}$col = i \; {\rm mod}\; hlength$; \newline \phantom{\Begin\For}$hloc = vlength$; \quad {\rm(* location of a horizontal boundary *)}\newline \phantom{\Begin\For}\For $k = 0$ \To $hbounds-1$ \Do \newline \phantom{\Begin\For\If}\For $p = 0$ \To $hlength-1$ \Do \newline \phantom{\Begin\For\If\If}\If $(col = p)$ \Then \newline \phantom{\Begin\For\If\If\If}\If $((X)^{[hloc-1,i]} = (X)^{[hloc,i]}) \wedge ((label)^{[hloc-1,i]} \neq (label)^{[hloc,i]})$ \Then \newline \phantom{\Begin\For\If\If\If\If}{\rm(* broadcast labels to both regions *)} \newline \phantom{\Begin\For\If\If\If\If}$minl = \min \{ (label)^{[hloc-1,i]}, (label)^{[hloc,i]} \}$;\newline \phantom{\Begin\For\If\If\If\If}$maxl = \max \{ (label)^{[hloc-1,i]}, (label)^{[hloc,i]} \}$;\newline \phantom{\Begin\For\If\If\If\If}$(min\_label)^{[*,i]} = minl$; \newline \phantom{\Begin\For\If\If\If\If}$(max\_label)^{[*,i]} = maxl$; \newline \phantom{\Begin\For\If\If\If}\Endif \newline \phantom{\Begin\For\If\If}\Endif \medskip\noindent \end{figure*} \begin{figure*} \medskip\noindent \phantom{\Begin\For\If\If}\rowm \quad {\rm(* pick up new labels from appropriate columns *)}\newline \phantom{\Begin\For\If\If}$b = strip*hlength + p$; \newline \phantom{\Begin\For\If\If}$minl = (min\_label)^{[i,b]}$; \newline \phantom{\Begin\For\If\If}$maxl = (max\_label)^{[i,b]}$; \newline \phantom{\Begin\For\If\If}\colm \hskip 0.5in {\rm(* update labels in both regions *)} \newline \phantom{\Begin\For\If\If}\For ($b = hloc-vlength$) \To ($hloc+vlength-1$) \Do \newline \phantom{\Begin\For\If\If\If}\If $((label)^{[b,i]} = maxl)$ \Then \newline
\phantom{\Begin\For\If\If\If\If}$(label)^{[b,i]} = minl$; \newline \phantom{\Begin\For\If}\Endfor \newline \phantom{\Begin\For\If}$hloc = hloc + 2*vlength$; \newline \phantom{\Begin\For}\Endfor \newline \phantom{\Begin}\Endfor \newline \Endforall\medskip\noindent \end{figure*} Merging of two subimages involves updating labels in both subimages, so that any connected component that extends across the common boundary is labeled by the same label. When we merge two subimages we sequentially scan the pixels along their common boundary, processing a different pair of neighboring pixels at each step. If these pixels have the same gray value, they belong to the same region. We then merge the regions corresponding to these neighboring pixels and assign them the smaller of the two pixel labels. This procedure is described in detail in Algorithm 2. Initially, all the regions are of size $1 \times 1$ (i.e., one pixel each) and each is assigned as its label its index in the row-major ordering of the image array. These subimages of size $1 \times 1$ are merged into larger subimages in parallel by the main loop of Algorithm 2. In this loop we first merge subimages horizontally and then vertically. Let us now examine the details of the horizontal merge operation. In this operation we merge two labeled subimages of size $m \times m$ into one subimage of size $m \times 2m$. This is done by scanning the common boundary between the two subimages pixel by pixel. The variable $vlength$ keeps track of the number of boundary pixel pairs to be scanned in the present merge operation. When the $i^{th}$ pixel pair on the boundary is considered, processor $P_i$ is active in the row access mode. It compares the two pixels and, if they have the same value, updates their labels to the minimum of the two labels.
Besides updating the labels of the pair, it also updates the labels of all pixels that were connected to these two pixels. This is done as follows. Processor $P_i$ broadcasts the smaller and the larger labels ($min\_label$ and $max\_label$) to all the memory modules in row $i$ (see Fig.\ts 2a). Then, in the column mode, all processors pick up these new labels from the appropriate rows of the subimages to which they belong. That is, each processor picks up the label from the row $r$ of the horizontal $strip$ to which it belongs; e.g., in Fig.\ts 2b processor $P_c$ picks up labels from row $b$ of the array (which corresponds to row $r$ of $strip$ $x$), whereas processor $P_l$ picks up from row $j$ of the array, and so on. Finally, we again switch back to the row mode and all processors scan the rows of the subimage to which they belong and update the labels. This is shown by horizontal arrows in Fig.\ts 2c. This merging process is done in parallel for all the vertical boundaries located at the position given by the variable $vloc$, and is repeated for all such $vloc$'s in the horizontal merge operation. The total number of such boundaries is given by the variable $vbounds$. In the second part of the main loop we merge vertically along the horizontal boundaries. In this operation we merge two labeled subimages of size $m \times 2m$ into one subimage of size $2m \times 2m$. This is done in the same way as the horizontal merge operation except that the boundaries are now horizontal and twice the length of the vertical boundaries. The running time for Algorithm 2 can be computed by considering the number of iterations executed by the main loop and the number of steps performed in each iteration. The total number of steps is given by \begin{equation} T(N) = \sum_{j=0}^{\log N -1} (T_h + T_v) \end{equation} where $T_h$ and $T_v$ are the numbers of steps required for the horizontal and vertical merge operations.
\begin{equation} T_h = \sum_{k=0}^{{N\over 2^{j+1}}-1} \sum_{p=0}^{2^j-1} 2^{j+1} = N*2^j \end{equation} and \begin{equation} T_v = \sum_{k=0}^{{N\over 2^{j+1}}-1} \sum_{p=0}^{2^{j+1}-1} 2^{j+1} = 2N*2^j\; . \end{equation} Therefore, \begin{equation} T(N) = \sum_{j=0}^{\log N -1} 3N*2^j = 3*(N^2 - N)\; . \end{equation} Hence, the time complexity of the above algorithm is $O(N^2)$ for an image of size $N \times N$ on an ACMAA of size $N$. The algorithm described above is straightforward and does not require any restructuring or preprocessing of the input data, nor does it require any complex communication routines. \setcounter{figure}{1} \begin{figure} \vspace{6.8cm} \caption[]{Horizontal merge operation} \end{figure} \section{Area of a region} The area of a region or a connected component is one of its basic descriptive properties. In a discrete image it is defined as the total number of pixels belonging to the region. A tree-based, $O(\log N)$ area algorithm for the NON-VON architecture is described by Ibrahim (1984). In this section we propose two methods to compute the area of all the regions in an image. The input to our algorithms is a labeled image stored in the ACMAA memory array. \setcounter{figure}{2} \begin{figure*} \vspace{9cm} \caption[]{Embedding a pyramid on the ACMAA of size 4} \end{figure*} \subsection{Summation pyramid approach} A pyramid is a 4-ary tree constructed on an $N \times N$ array of image pixels. The leaves of this tree (at level 0) are image pixels, and each internal node at level $j$ is connected to four nodes at level $j-1$ and one node at level $j+1$. The $apex$ (root) of this pyramid is at level $\log N$. Such a pyramid can be used to perform aggregation operations on an $N \times N$ image in $O(\log N)$ time. For example, we can construct a summation pyramid by instructing each internal node to add up the values received from its four children, so that we get the sum of $N^2$ numbers at the apex of this pyramid.
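The summation just described can be illustrated with a short sequential sketch. The following Python function is our own illustration (not the authors' code); it folds each $2 \times 2$ group of partial sums into the group's top left element, level by level, so that after $\log N$ levels the total resides in the position corresponding to the apex. The two inner loops correspond to the column-mode and row-mode transfer steps of the embedding described below.

```python
import math

def pyramid_sum(grid):
    """Sum an n x n array (n a power of two) by 4-ary reduction."""
    n = len(grid)
    s = [row[:] for row in grid]          # per-module partial sums
    for k in range(1, int(math.log2(n)) + 1):
        step, half = 2 ** k, 2 ** (k - 1)
        # "Column mode": bottom corners of each 2x2 subarray send upward.
        for i in range(0, n, step):
            for j in range(0, n, half):
                s[i][j] += s[i + half][j]
        # "Row mode": top right corner sends to top left corner.
        for i in range(0, n, step):
            for j in range(0, n, step):
                s[i][j] += s[i][j + half]
    return s[0][0]                        # apex value, held by M_00

# A 4 x 4 image in which every module starts with sum = 1
# (i.e., every pixel belongs to the region of interest):
assert pyramid_sum([[1] * 4 for _ in range(4)]) == 16
```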
We now show how to embed a pyramid on the ACMAA. Fig.\ts 3 shows an embedding on an ACMAA of size $4$. The $N \times N$ array of memory modules forms the base of a pyramid. Since we have a two-dimensional structure to simulate a three-dimensional pyramid, we shall use some of the modules to store information pertaining to more than one pyramid level. When we merge four nodes (a $2 \times 2$ subarray) at level $j$ into one node at level $j+1$, we store the information of this higher level parent node in the top left node of its $2 \times 2$ subarray. This is depicted in Fig.\ts 3. In Fig.\ts 3b the white nodes are the nodes at level 0 and the shaded nodes store information for levels 0 and 1. The darkest node in Fig.\ts 3e corresponds to the apex at level 2 and hence stores information of a node at all three levels. A transfer from level $j$ to level $j+1$ of the pyramid is performed in two ACMAA steps as follows. In the first step, the two bottom corner nodes of each $2 \times 2$ subarray at level $j$ send their data to the two top corner nodes using the column mode (Fig.\ts 3a,d). Then, in the second step, the top right corner node of a subarray sends the combined information to the top left corner node of its subarray, which then becomes a node at level $j+1$ (Fig.\ts 3b,e). We can compute the area of a region using the pyramid embedding as follows. First, initialize a variable $sum$ to $1$ in all the modules (pixels) whose label corresponds to that of the region of interest. All other modules have $sum$ initialized to $0$. Then we construct a summation pyramid by adding up the variables $sum$ in each module as we go up the pyramid, so that the area of this region will be available in the memory module $M_{00}$, which corresponds to the apex of this pyramid. An ACMAA procedure to compute the area of a region labeled $L$ is described in Algorithm 3.
\begin{figure*} \medskip\noindent {\bf Algorithm 3:} Area computation on a pyramid \medskip\noindent {\rm (* Area computations for an $N \times N$ image on ACMAA of size $N$ *)}\newline \Forall\ {\it processors} $P _i$, $(0 \leq i \leq N-1)$ \Dopar \newline \Begin \newline \phantom{\Begin}\colm \newline \phantom{\Begin}\For $j = 1$ \To $N$ \Do \hskip 0.5in {\rm (* Initialization *)} \newline \phantom{\Begin\For}\If $((label)^{[j,i]} = L)$ \Then \newline \phantom{\Begin\For\If}$(area)^{[j,i]} = 1$; \newline \phantom{\Begin\For}\Else $(area)^{[j,i]} = 0$; \newline \phantom{\Begin}\Endfor \newline \phantom{\Begin}\For $k = 1$ \To $\log N$ \Do \newline \phantom{\Begin\For}\colm \hskip 0.5in {\rm (* Fetch vertically (Fig.\ts 3a and d) *)} \newline \phantom{\Begin\For}\If $((i \; {\rm mod}\; 2^k = 0) \vee (i \; {\rm mod}\; 2^{k-1} = 0))$ \Then \newline \phantom{\Begin\For\If}\For $j=0$ \To $(N-1)$ \Step $2^k$ \Do \newline \phantom{\Begin\For\If\If}$(area)^{[j,i]} = (area)^{[j,i]} + (area)^{[j+2^{k-1},i]}$; \newline \phantom{\Begin\For}\Endif \newline \phantom{\Begin\For}\rowm \hskip 0.5in {\rm (* Fetch horizontally (Fig.\ts 3b and e) *)} \newline \phantom{\Begin\For}\If $(i \; {\rm mod}\; 2^k = 0)$ \Then \newline \phantom{\Begin\For\If}\For $j=0$ \To $(N-1)$ \Step $2^k$ \Do \newline \phantom{\Begin\For\If\If}$(area)^{[i,j]} = (area)^{[i,j]} + (area)^{[i,j+2^{k-1}]}$; \newline \phantom{\Begin\For}\Endif \newline \phantom{\Begin}\Endfor \hskip 0.6in {\rm (* Final result is stored in module $M_{00}$ *)}\newline \Endforall \medskip\noindent \end{figure*} The initialization step of Algorithm 3 takes $O(N)$ time. In order to compute the area of a component labeled $L$ the main loop iterates $\log N$ times. In an iteration $k$ a processor simulates ${N\over 2^k}$ nodes belonging to level $k$ of the pyramid. Hence, the total time taken by the main loop is given by, \begin{equation} \sum_{k=1}^{\log N} {N\over 2^k} = O(N)\; . 
\end{equation} Hence, if there are $C$ connected components in an image, it takes $O(CN)$ time steps to compute their areas using the summation pyramid approach. Area computation for a labeled region can be done on a uniprocessor in $O(N^2)$ time steps; hence the ACMAA algorithm achieves a linear speed-up using $N$ processors. \subsection{Split-and-merge approach} In this section we describe a bottom-up, divide-and-conquer approach similar to the one described in Sect. 4 to compute the area of all regions in an $N \times N$ image on ACMAA of size $N$. As before, we first partition the image into subimages. Then, we compute areas in each subimage and finally we merge the subimages by adding up the areas of regions with identical labels extending across their common boundary. Initially, all the modules have their variable {\it area} initialized to $1$ because that is the area of the one pixel stored in a module. Then in each successive iteration we merge horizontally and vertically as mentioned before. However, in the merge step we do the following. We scan the common boundary between the subimages to be merged looking for a pair of 4-connected pixels with the same label, say $L$. Once such a pair is found we combine the areas of this region lying in the two subimages and update the variable {\it area} for all modules with label $L$ in these subimages. This new value of {\it area} is the sum of the areas stored in the pixels of the 4-connected pair. In each merge operation we update the area of each region only once, because all the modules in a subimage store the area of the component to which they belong, and so we need only one 4-connected pair of the region extending across the common boundary to combine the areas from the two subimages. In Algorithm 4 we give the pseudo-code to perform the above-mentioned merge operation (other parts of this algorithm are similar to those of Algorithm 2). The variable names used in this algorithm correspond to those used in Algorithm 2.
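The combine-and-broadcast step of this merge can be sketched sequentially. This is an illustrative simplification, not the paper's pseudo-code: a Python set stands in for the per-module $update$ flag of Algorithm 4, and label 0 is assumed to be background; the $4 \times 4$ example image is made up.

```python
# Sketch of one horizontal merge from the split-and-merge area method.
# Every pixel already holds the area of its component within its own
# subimage; one 4-connected pair per region across the shared vertical
# boundary triggers a combine-and-broadcast of the summed area.

def merge_areas(labels, area, vloc, vlength):
    """Merge across the vertical boundary at column vloc; the subimages
    are columns [vloc - vlength, vloc) and [vloc, vloc + vlength)."""
    N = len(labels)
    updated = set()                      # regions already merged here
    for i in range(N):                   # scan the common boundary
        L = labels[i][vloc]
        if L == 0 or labels[i][vloc - 1] != L or L in updated:
            continue
        new_area = area[i][vloc - 1] + area[i][vloc]
        for b in range(vloc - vlength, vloc + vlength):
            for r in range(N):
                if labels[r][b] == L:    # broadcast to the whole region
                    area[r][b] = new_area
        updated.add(L)

labels = [[5, 5, 5, 0],
          [0, 5, 5, 5],
          [0, 0, 0, 0],
          [7, 7, 7, 7]]
area = [[3, 3, 3, 0],        # per-pixel areas within each 2-column half
        [0, 3, 3, 3],
        [0, 0, 0, 0],
        [2, 2, 2, 2]]
merge_areas(labels, area, vloc=2, vlength=2)
assert area[0][0] == 6       # region 5 spans both halves: 3 + 3
assert area[3][3] == 4       # region 7: 2 + 2
```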
\begin{figure*} \medskip\noindent {\bf Algorithm 4:} Merge operation for area computation \medskip\noindent $\ldots$ \newline $\ldots$ \newline {\rm(* Merging horizontally across vertical boundaries *)} \smallskip\noindent \phantom{\Begin}\For $k = 0$ \To $vbounds-1$ \Do \newline \phantom{\Begin\For}\For $p = 0$ \To $vlength-1$ \Do \newline \phantom{\Begin\For\If}\If $(row = p)$ \Then \quad {\rm(* merge two regions only if they were not merged before *)}\newline \phantom{\Begin\For\If\If}\If $((label)^{[i,vloc-1]} = (label)^{[i,vloc]}) \wedge ((update)^{[i,vloc]} = 0)$ \Then \newline \phantom{\Begin\For\If\If\If}{\rm(* combine and broadcast the two areas and their label *)}\newline \phantom{\Begin\For\If\If\If}$new\_area = (area)^{[i,vloc-1]} + (area)^{[i,vloc]}$; \newline \phantom{\Begin\For\If\If\If}$(n\_area)^{[i,*]} = new\_area$; \newline \phantom{\Begin\For\If\If\If}$(L)^{[i,*]} = (label)^{[i,vloc]}$; \newline \phantom{\Begin\For\If\If}\Endif \newline \phantom{\Begin\For\If}\Endif \newline \phantom{\Begin\For\If}\colm \quad {\rm(* pick up new area from appropriate rows *)}\newline \phantom{\Begin\For\If}$b = strip * vlength + p$; \newline \phantom{\Begin\For\If}$new\_area = (n\_area)^{[b,i]}$; \newline \phantom{\Begin\For\If}$area\_label = (L)^{[b,i]}$; \newline \phantom{\Begin\For\If}\rowm \quad {\rm(* update areas in both regions *)} \newline \phantom{\Begin\For\If}\For $(b = vloc-vlength)$ \To $(vloc+vlength-1)$ \Do \newline \phantom{\Begin\For\If\If}\If $((label)^{[i,b]} = area\_label)$ \Then \newline \phantom{\Begin\For\If\If\If}$(area)^{[i,b]} = new\_area$; \newline \phantom{\Begin\For\If\If\If}$(update)^{[i,vloc]} = 1$; \quad {\rm(* set the update flag *)}\newline \phantom{\Begin\For\If\If}\Endif \newline \phantom{\Begin\For\If}\Endfor \newline \phantom{\Begin\For}\Endfor \newline \phantom{\Begin\For}$vloc = vloc + 2*vlength$; \newline \phantom{\Begin}\Endfor\newline \phantom{\Begin}$vloc = vlength$; \newline \phantom{\Begin}\For $k = 0$ \To
$vbounds-1$ \Do \newline \phantom{\Begin\For}$(update)^{[i,vloc]} = 0$; \quad {\rm(* reset the update flag *)}\newline \phantom{\Begin\For}$vloc = vloc + 2*vlength$; \newline \phantom{\Begin}\Endfor\newline \newline {\rm(* Similarly do it for vertical merging across all horizontal boundaries *)}\newline $\ldots$ \newline $\ldots$ \medskip\noindent \end{figure*} The running time for Algorithm 4 is the same as that for Algorithm 2, i.e., $O(N^2)$ for an image of size $N \times N$ on ACMAA of size $N$. If $C$ is the number of regions (connected components) in an $N \times N$ image, then the summation pyramid approach gives better performance than the split-and-merge approach for images in which $C \leq N$. \section{Perimeter of a region} The perimeter of a region or a connected component is defined as the total length of the region boundary, i.e., it is the total number of boundary pixels in a region. These boundary pixels include the pixels belonging to the external as well as the internal (surrounding a hole) boundaries of a region. In a binary picture where the background is represented by white pixels, the perimeter is given by the total number of black pixels of a region adjacent to a white pixel. In this section two methods for computing the perimeters of the regions in an image are proposed. \subsection{Summation pyramid approach} The summation pyramid approach used for area computation can be used to compute the perimeter of a region labeled $L$ as follows. In the first step we mark all the boundary pixels having label $L$. A boundary pixel of a region is a pixel having at least one non-region pixel (i.e., a pixel whose label is not equal to $L$) adjacent to it in the $3 \times 3$ neighborhood around it. Then we add up these marked pixels in a summation pyramid to obtain the perimeter of the region labeled $L$ from the memory module $M_{00}$. Hence, for an image with $C$ regions, it takes $O(CN)$ time on ACMAA of size $N$ to compute all the perimeters.
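The boundary-marking step can be sketched sequentially as follows. Treating positions outside the image as non-region pixels is an assumption of this sketch (the paper does not state a border convention), and the example image is made up; counting the marks plays the role of the pyramid summation.

```python
# Sketch of boundary marking: a pixel of region L is a boundary pixel if
# some position in its 3x3 neighbourhood holds a different label (or lies
# outside the image -- an assumption of this sketch).

def perimeter(labels, L):
    N = len(labels)
    count = 0
    for i in range(N):
        for j in range(N):
            if labels[i][j] != L:
                continue
            boundary = any(
                not (0 <= i + di < N and 0 <= j + dj < N)
                or labels[i + di][j + dj] != L
                for di in (-1, 0, 1) for dj in (-1, 0, 1))
            if boundary:
                count += 1          # this mark is what the pyramid sums
    return count

img = [[0] * 5 for _ in range(5)]
for i in (1, 2, 3):
    for j in (1, 2, 3):
        img[i][j] = 9               # a 3x3 block of label 9
assert perimeter(img, 9) == 8       # only the centre pixel is interior
```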
\subsection{Split-and-merge approach} A bottom-up, divide-and-conquer approach similar to the one described in Sects. 4 and 5.2 can also be used to compute the perimeters of all regions in an $N \times N$ image on ACMAA of size $N$. This is done as follows. We first partition the image into subimages, then we compute perimeters in each subimage, and finally we merge the subimages by adding up the perimeters of regions with identical labels extending across the boundary between the subimages. When we merge perimeters in two subimages, the perimeter pixels on the common boundary cease to be boundary pixels because they become internal pixels of the merged regions. Initially, all the non-background pixels are marked as boundary pixels and their variable $perimeter$ is initialized to $1$. Then in each successive iteration we merge horizontally and vertically as mentioned before. However, in the merge step we do the following. We scan the common boundary between the subimages to be merged looking for a pair of 4-connected boundary pixels. Once such a pair is found we merge the perimeters and then re-mark them appropriately as boundary or non-boundary pixels by looking at the $3 \times 3$ neighborhood around each pixel. When we merge the perimeters of two regions with the same label $L$ in the two subimages, we update the variable $perimeter$ of all the boundary (marked) pixels in both subimages to the sum of the two perimeters. However, when a pixel on the common boundary ceases to be a boundary pixel, we set its $perimeter$ variable to $0$ and subtract one from that of the other boundary pixels with the same label. After all the $\log N$ horizontal and vertical merges are performed, we have an output image in which all the marked pixels belong to boundaries of regions and the variable $perimeter$ in these marked pixels corresponds to the length of the boundary to which they belong.
Hence, in $O(N^2)$ time steps we can mark all the boundary pixels and compute the perimeter of all regions in an $N \times N$ image on ACMAA of size $N$. As mentioned before, the summation pyramid approach performs better than the split-and-merge approach for an $N \times N$ image in which the number of regions $C \leq N$. \section{Moments} In computer vision moments are used as descriptors of shapes in an image. For a two-dimensional discrete function $f(x,y)$ the moment of order $(p+q)$ is defined by \begin{equation} m_{pq} = \sum_x \sum_y x^p y^q f(x,y) \; . \end{equation} For a binary image, $f(x,y)$ has a value $1$ inside the region and $0$ elsewhere. This function reflects the shape of the object, and it has a unique set of moments (Gonzalez and Wintz 1977; Ibrahim 1984). The area of a region is given by its {\it zeroth-order} $(p=q=0)$ moment ($m_{00}$). Together with the first-order moments $m_{10}$ and $m_{01}$, it gives the coordinates of the centroid $(\bar{x},\bar{y})$ of a region: \begin{equation} \bar{x} = {m_{10}\over m_{00}},\quad \bar{y} = {m_{01}\over m_{00}}\; . \end{equation} Moments of a shape can be made size-invariant by dividing them by its area ($m_{00}$). They can be made position-invariant by computing {\it central moments} defined as follows: \begin{equation} \mu_{pq} = \sum_x \sum_y (x-\bar{x})^p (y-\bar{y})^q f(x,y)\; . \end{equation} Using these moments it is possible to derive a set of {\it normalized central moments} which are invariant to translation, rotation and scale change (Gonzalez and Wintz 1977). Moments $m_{pq}$ of a shape labeled $L$ can be obtained by a two-step ACMAA algorithm as follows: in the first step each processor computes the product $x^py^q$ in the column mode (where $x=i$ and $y=j$ for processor $P_i$ accessing the $j^{th}$ module in its column memory). This can be done in $O(N)$ time steps for an image of size $N \times N$ on ACMAA of size $N$.
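As a plain sequential reference for these moment definitions (not the ACMAA procedure itself), the following sketch computes $m_{pq}$, the centroid, and the central moments of a small made-up binary region:

```python
# Sequential reference: m_pq, the centroid, and the central moments mu_pq
# of a binary region f (rows indexed by x, columns by y in this sketch).

def moment(f, p, q):
    return sum(x ** p * y ** q * f[x][y]
               for x in range(len(f)) for y in range(len(f[0])))

f = [[0, 0, 0, 0],
     [0, 1, 1, 0],
     [0, 1, 1, 0],
     [0, 0, 0, 0]]

m00 = moment(f, 0, 0)              # area of the region
xbar = moment(f, 1, 0) / m00       # centroid coordinates
ybar = moment(f, 0, 1) / m00

def central_moment(f, p, q):
    return sum((x - xbar) ** p * (y - ybar) ** q * f[x][y]
               for x in range(len(f)) for y in range(len(f[0])))

assert m00 == 4
assert (xbar, ybar) == (1.5, 1.5)
assert central_moment(f, 1, 0) == 0       # first central moments vanish
assert central_moment(f, 2, 0) == 1.0     # 4 * (0.5)^2
```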
In the second step we first multiply these products by their respective function values $f(x,y)$ (where $f(x,y)=1$ if $label(x,y)=L$ and $0$ otherwise) and then add these new products over the whole image using the summation pyramid approach described earlier. Hence, the complete algorithm takes $O(N)$ time steps to compute the $(p+q)$-order moments of a shape (region) labeled $L$. Note that for computing the moments $m_{pq}$ for all the regions in an image the first step needs to be executed only once, because the product $x^py^q$ does not depend on whether the point $(x,y)$ lies inside a region or elsewhere. \section{Convex hull of a region} In this section we describe an algorithm to determine the vertices of the convex hull of a region consisting of pixels with the same label. The convex hull of a region is the smallest convex polygon containing all the pixels belonging to the region. We assume that we have a labeled image stored in the memory array, one pixel per module. Given a set $S$ of pixels with label $L$, we define $Hull(S)$ as the set of pixels of $S$ that are vertices of the convex hull. The sides of this convex polygon can be easily determined from this set of vertices (also referred to as {\it extreme points of} $S$). Miller and Stout (1985) show that many queries concerning $S$ can be reduced to questions concerning the extreme points of $S$. Fig.\ts 4a shows the convex hull of a region consisting of pixels with the same label (shaded).
\setcounter{figure}{3} \begin{figure} \vspace{9cm} \caption[]{Determining extreme points of the convex hull of a region} \end{figure} \begin{figure*} \noindent% \hrule \medskip\noindent {\bf Algorithm 5:} Extreme points of convex hull of a region \medskip\noindent \Forall\ {\it processors} $P _i$, $(0 \leq i \leq N-1)$ \Dopar \newline \Begin \newline \phantom{\Begin}\colm \newline \phantom{\Begin}$top = \infty$; \hskip 0.7in {\rm (* Initializations *)} \newline \phantom{\Begin}$bottom = -\infty$; \smallskip\noindent \phantom{\Begin}{\rm (* Determine extreme points for each column of region with label $L$ *)}\smallskip\noindent \phantom{\Begin}\For $j = 0$ \To $N-1$ \Do \newline \phantom{\Begin\For}\If $(top = \infty) \wedge ((label)^{[j,i]} = L)$ \Then \newline \phantom{\Begin\For\If}$top = j$; \newline \phantom{\Begin\For}\If $((label)^{[j,i]} = L)$ \Then \newline \phantom{\Begin\For\If}$bottom = j$; \newline \phantom{\Begin\For}$(Hull)^{[j,i]} = .false.$; \hskip 0.5in {\rm(* Initialize $Hull(S)$ *)}\newline \phantom{\Begin}\Endfor \newline \phantom{\Begin}\rowm \hskip 0.8in {\rm (* Broadcast $top$ and $bottom$ to all columns *)}\newline \phantom{\Begin}$(Top)^{[i,*]} = top$; \newline \phantom{\Begin}$(Bottom)^{[i,*]} = bottom$; \smallskip\noindent \phantom{\Begin}{\rm (* Determine if the column extreme points belong to $Hull(S)$ *)} \smallskip\noindent \phantom{\Begin}\colm \newline \phantom{\Begin}{\rm (* Check the bottom points with $bottom \neq -\infty$ *)}\newline \phantom{\Begin}$\alpha = \infty$; \newline \phantom{\Begin}\For $j = i+1$ \To $N-1$ \Do \quad {\rm(* points on the right *)}\newline \phantom{\Begin\For}$\delta x = j - i$; \newline \phantom{\Begin\For}$\delta y = bottom - (Bottom)^{[j,i]}$; \newline \phantom{\Begin\For}$\theta = \tan^{-1}({\delta y\over \delta x})$; \newline \phantom{\Begin\For}$\alpha = \min\{ \alpha, \theta \}$; \quad {\rm (* $bottom$ point with most negative (smallest) $\alpha$ *)}\newline \phantom{\Begin}\Endfor 
\smallskip\noindent \phantom{\Begin}$\gamma = -\infty$;\newline \phantom{\Begin}\For $j = i-1$ \To $0$ \Step $-1$ \Do \quad {\rm(* points on the left *)}\newline \phantom{\Begin\For}$\delta x = j - i$; \newline \phantom{\Begin\For}$\delta y = bottom - (Bottom)^{[j,i]}$; \newline \phantom{\Begin\For}$\theta = \tan^{-1}({\delta y\over \delta x})$; \newline \phantom{\Begin\For}$\gamma = \max\{ \gamma, \theta \}$; \quad {\rm (* $bottom$ point with most positive (largest) $\gamma$ *)}\newline \phantom{\Begin}\Endfor \smallskip\noindent \phantom{\Begin}$\beta = 180^\circ + \gamma - \alpha$; \newline \phantom{\Begin}\If $(\beta < 180^\circ)$ \Then \newline \phantom{\Begin\For}$(Hull)^{[bottom,i]} = .true.$; \smallskip\noindent \phantom{\Begin}{\rm (* Check the top points with $top \neq \infty$ *)}\newline \phantom{\Begin}$\alpha = -\infty$; \newline \phantom{\Begin}\For $j = i+1$ \To $N-1$ \Do \quad {\rm(* points on the right *)}\newline \phantom{\Begin\For}$\delta x = j - i$; \newline \phantom{\Begin\For}$\delta y = top - (Top)^{[j,i]}$; \newline \phantom{\Begin\For}$\theta = \tan^{-1}({\delta y\over \delta x})$; \newline \phantom{\Begin\For}$\alpha = \max\{ \alpha, \theta \}$; \quad {\rm (* $top$ point with most positive (largest) $\alpha$ *)}\newline \phantom{\Begin}\Endfor\newline \phantom{\Begin}$\gamma = \infty$;\newline \phantom{\Begin}\For $j = i-1$ \To $0$ \Step $-1$ \Do \quad {\rm(* points on the left *)}\newline \phantom{\Begin\For}$\delta x = j - i$; \newline \phantom{\Begin\For}$\delta y = top - (Top)^{[j,i]}$; \newline \phantom{\Begin\For}$\theta = \tan^{-1}({\delta y\over \delta x})$; \newline \phantom{\Begin\For}$\gamma = \min\{ \gamma, \theta \}$; \quad {\rm (* $top$ point with most negative (smallest) $\gamma$ *)} \medskip\noindent \end{figure*} \begin{figure*} \medskip\noindent \phantom{\Begin}\Endfor \smallskip\noindent \phantom{\Begin}$\beta = 180^\circ - \gamma + \alpha$; \newline \phantom{\Begin}\If $(\beta < 180^\circ)$ \Then \newline 
\phantom{\Begin\For}$(Hull)^{[top,i]} = .true.$; \smallskip\noindent \Endforall \medskip\noindent \hrule \end{figure*} In order to determine if a pixel of region $S$ belongs to {\it Hull}$(S)$, we make use of the following property of an extreme point of a convex hull: let $x \in S$ be a point and let $\beta$ be the smallest angle subtended at $x$ that encloses $S$; then $x \in \hbox{\it Hull}(S)$ iff $\beta < 180^\circ$. For example, in Fig.\ts 4a point $p \in \hbox{\it Hull}(S)$ because $\beta < 180^\circ$, whereas points $q$ and $r$ in Fig.\ts 4b and c do not belong to {\it Hull}$(S)$ because $\beta \geq 180^\circ$. In our algorithm each processor first determines the extreme points in its column. These points are likely candidates for extreme points of the convex hull. On an $N \times N$ array of pixels, $S$ may have $O(N^2)$ points, but since $S$ has at most two extreme points in any column, it has $O(N)$ extreme points. Each processor broadcasts its extreme points to all other processors, which then determine if their extreme points belong to {\it Hull}$(S)$ using the property mentioned above. Algorithm 5 describes a procedure to determine {\it Hull}$(S)$ for a region $S$, which is made up of pixels with label $L$. In Algorithm 5 each processor determines its topmost and bottommost points in $O(N)$ time steps by scanning its column memory. Then each processor broadcasts its $top$ and $bottom$ values to all its row modules using the broadcast mode. This is done in $O(1)$ time. Finally, each processor determines whether the extreme points of its column belong to {\it Hull}$(S)$ by scanning the points on its right and left. This takes $O(N)$ time steps. Therefore, the extreme points of the convex hull of a region can be identified in $O(N)$ time steps. \section{Hough transform} The Hough transform is a technique used to extract straight line segments from an image. Line segments are frequently used as key features in constructing image models.
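The angle test of Algorithm 5 (previous section) can be exercised sequentially for the bottom extreme point of one column. This sketch follows the pseudo-code's conventions (degrees, $\tan^{-1}$ of $\delta y/\delta x$, row indices growing downward); skipping columns that do not intersect the region is a simplification of the $\pm\infty$ initial values:

```python
import math

# Sketch of the extreme-point angle test: the bottom point of column i is
# a hull vertex iff the enclosing angle beta = 180 + gamma - alpha < 180.

def bottom_is_hull_vertex(bottoms, i):
    bottom = bottoms[i]
    alpha = math.inf                 # most negative angle on the right
    for j in range(i + 1, len(bottoms)):
        if math.isinf(bottoms[j]):
            continue                 # column does not intersect the region
        alpha = min(alpha, math.degrees(
            math.atan((bottom - bottoms[j]) / (j - i))))
    gamma = -math.inf                # most positive angle on the left
    for j in range(i - 1, -1, -1):
        if math.isinf(bottoms[j]):
            continue
        gamma = max(gamma, math.degrees(
            math.atan((bottom - bottoms[j]) / (j - i))))
    beta = 180 + gamma - alpha
    return beta < 180

# bottom rows (larger = lower) of a region occupying columns 1..3
assert bottom_is_hull_vertex([-math.inf, 2.0, 3.0, 2.0, -math.inf], 2)
assert not bottom_is_hull_vertex([-math.inf, 3.0, 2.0, 3.0, -math.inf], 2)
```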
This method involves transforming each image point into a sinusoidal curve in a {\it parameter space} defined by Duda and Hart (1972): \begin{equation} \rho = x\cos\theta + y\sin\theta\; . \end{equation} In this parameterization $(x, y)$ is a point on the line, $\rho$ is its normal distance from the origin and $\theta$ is the angular position of the normal. This parameter space is also referred to as the $\rho$--$\theta$ {\it plane} or {\it Hough space}. In order to keep the parameters for a line unique, $\theta$ is restricted to the interval $[0, \pi)$ (Duda and Hart 1972). This restriction gives us a unique point in the parameter plane for every line in the image plane. Hence, for an $N \times N$ image and $0 \leq \theta < \pi$ we get $-\sqrt{2}N \leq \rho \leq \sqrt{2}N$. Let the parameter plane be quantized into a quadruled grid, and the quantized region be treated as a two-dimensional accumulator array. For each point $(x_i, y_i)$ in the image plane, the corresponding curve in the parameter plane is entered in this array by incrementing the count in all cells along the sinusoid. A peak count $k$ in a cell $(\theta_i, \rho_j)$ implies that there are $k$ collinear points along the line whose parameters are $(\theta_i, \rho_j)$. Let $\Delta\theta$ and $\Delta\rho$ be the resolutions for $\theta$ and $\rho$; then the size of the Hough accumulator array is $R \times \Theta$, where \begin{equation} \Theta = {\pi\over \Delta\theta} \quad {\rm and}\quad R = {2\sqrt{2}N+1\over \Delta\rho}\; . \end{equation} $\rho$ can be quantized in the same manner as $x$ and $y$, whereas $\Delta\theta$ is chosen as a compromise among speed, space and accuracy. The line extraction algorithm based on the Hough transform can be divided into two parts: computation of the count values in the Hough array, and determination of the parameter cells whose counts exceed a certain threshold. Computation of the Hough array is done sequentially for all values of $\theta$.
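The voting scheme just described can be sketched directly; $N$, the number of $\theta$ samples, and the edge list below are illustrative choices of this sketch, not the paper's parameters. As in Algorithm 6, $\rho$ is shifted by $\sqrt{2}N$ so the row index is non-negative:

```python
import math

# Hough voting: each edge pixel (x, y) increments every accumulator cell
# (p, t) on its sinusoid rho = x cos(theta_t) + y sin(theta_t).

N = 8
THETA = 16                                   # theta samples in [0, pi)
R = int(2 * math.sqrt(2) * N + 1)            # rho in [-sqrt(2) N, sqrt(2) N]
acc = [[0] * THETA for _ in range(R)]        # accumulator array

edges = [(1, 1), (2, 2), (3, 3), (4, 4)]     # collinear points on y = x

for t in range(THETA):
    theta = t * math.pi / THETA
    for (x, y) in edges:
        rho = x * math.cos(theta) + y * math.sin(theta)
        p = int(math.floor(rho + math.sqrt(2) * N))   # shift rho to >= 0
        acc[p][t] += 1

peak = max(max(row) for row in acc)
assert peak == 4                             # all four points share one cell
assert sum(map(sum, acc)) == 4 * THETA       # one vote per point per theta
```

The peak occurs at $\theta = 3\pi/4$, the normal direction of the line $y = x$, which is where all four sinusoids intersect.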
For each value of $\theta$ a processor $P_i$ computes the contributions of all the pixels stored in its column memory and stores the results in a vector $r_{\theta}$ of size $R$. These column vectors are then added up and the resulting vector is stored in the memory column corresponding to the value of $\theta$. Algorithm 6 describes the details of the Hough transform procedure for extracting line segments. The second part of the line extraction algorithm is responsible for reporting the detected lines in the input image ($X$) by post-processing the Hough array. This can be achieved in a number of ways: for example, by reporting the contents of the Hough cells that are greater than a certain threshold and their corresponding $(\rho,\theta)$ values, or by showing the lines graphically in an output image. For the first method each processor can scan its row and report the values that exceed a threshold value. The procedure for the second approach is slightly more elaborate and is described below in Algorithm 6. The procedure is as follows: for each value of $\theta$ we first broadcast the Hough array column to all the memory array columns. Then, each processor marks its pixels ``1'' if they contribute to a Hough accumulator cell whose count exceeds a certain threshold. Once a pixel is marked ``1'' it is never made ``0''. The output image ($L$) thus created contains the extracted straight lines of the input image. Sometimes this simple thresholding of the Hough array may not be enough to extract lines because there can be many clustered cells in the Hough array exceeding the threshold. In that case we may need to further process the array using a local maxima computation and then report the line segments. Local maxima computations on the ACMAA can be done using local neighborhood operations similar to those described in Balsara and Irwin (1991).
The first section of Algorithm 6 computes the columns of the Hough array for each value of $\theta$ sequentially. Hence, we perform $\Theta$ iterations. In iteration $t$ every processor computes the contributions of the pixels stored in its column memory and stores the results in the vector $r_{\theta}$. This takes $O(N)$ time. Finally, addition of all these vectors and storing the result in the column corresponding to $\theta_t$ takes $O(N)$ time. Hence, the Hough array is created in $O(\Theta N)$ time. Along similar lines, the extraction of straight lines in the second section also takes $O(\Theta N)$ time. Hence, the total time for Algorithm 6 is $O(\Theta N)$. A sequential algorithm takes $O(\Theta N^2)$ time steps to compute the Hough transform of an $N \times N$ image. Hence, our algorithm achieves a linear speed-up and is optimal for $N$ processors. The ACMAA algorithm achieves the same running time as the mesh algorithm described in Silberberg (1985); however, we use only $N$ processors compared to $N^2$ processors in the mesh. \section{Discussion} The algorithms described in this paper are simple and easy to understand. There are no complex pointer manipulations or data routing schemes. All operations are performed without any reformatting or serialization of the input image. This leads to a straightforward implementation with limited global control. It also results in a time-efficient implementation for tasks in which more than one operation is performed on an input image, because no rearranging or input/output of the intermediate results is needed. All of the algorithms work on both binary and gray-scale images. However, the computation time for the latter will be greater than that for the former because of the increase in operand size. \begin{figure*} \hrule \medskip\noindent {\bf Algorithm 6:} Hough transform \medskip\noindent {\rm(* Array X = edge image, array L = line image.
Hough array of size $R \times \Theta$ is }\newline {\rm\ \ \ stored in block storage fashion; i.e., more than one element per memory module. }\newline {\rm\ \ \ $R = 2\sqrt{2}N+1$, $\Theta = $total no. of $\theta$s, $\Delta\theta = $resolution for $\theta$ *)} \smallskip\noindent \Forall\ {\it processors} $P _i$, $(0 \leq i \leq N-1)$ \Dopar \newline \Begin \newline \phantom{\Begin}\colm \hskip 1.5in {\rm(* Initializations *)} \newline \phantom{\Begin}$r = \lc {R\over N} \rc$; \newline \phantom{\Begin}$s = \lc {\Theta\over N}\rc$; \newline \phantom{\Begin}\For $j=0$ \To $(N-1)$ \Do \newline \phantom{\Begin\For}$(L)^{[j,i]} = 0$; \newline \phantom{\Begin\For}\For $k=0$ \To $(r-1)$ \Do \newline \phantom{\Begin\For\If}\For $l=0$ \To $(s-1)$ \Do \newline \phantom{\Begin\For\If\If}$(H)^{[j,i]}\la k,l\ra = 0$;\newline \phantom{\Begin\For\If\If}$(r_{\theta})^{[j,i]}\la k\ra = 0$; \newline \phantom{\Begin\For\If}\Endfor\newline \phantom{\Begin}\Endfor \smallskip\noindent \phantom{\Begin}{\rm(* $\forall$ pixels compute $\rho$ values for every angle $\theta$ *)} \smallskip\noindent \phantom{\Begin}\For $t=0$ \To $(\Theta -1)$ \Do \newline \phantom{\Begin\For}$\theta = t\cdot\Delta\theta$; \newline \phantom{\Begin\For}\For $j=0$ \To $(N-1)$ \Do \quad {\rm(* $\forall$ pixels in column $i$ *)}\newline \phantom{\Begin\For\If}\If $((X)^{[j,i]} = edge)$ \Then \newline \phantom{\Begin\For\If\If}$\rho = i\cos\theta + j\sin\theta$; \newline \phantom{\Begin\For\If\If}$p = \lf (\rho + \sqrt{2}N) \rf$; \newline \phantom{\Begin\For\If\If}$(r_{\theta})^{[\lf{p\over r}\rf,i]}\la p\; {\rm mod}\; r\ra = (r_{\theta})^{[\lf{p\over r}\rf,i]}\la p\; {\rm mod}\; r\ra + 1$;\newline \phantom{\Begin\For\If}\Endif\newline \phantom{\Begin\For}\Endfor \smallskip\noindent \phantom{\Begin\For}{\rm(* Add similar $\rho$'s and store them in the column corresponding to $\theta_t$ *)} \smallskip\noindent \phantom{\Begin\For}\rowm \newline \phantom{\Begin\For}$col = \lf{t\over s}\rf$;\newline \phantom{\Begin\For}$l = t \; {\rm mod}\; s$; \newline \phantom{\Begin\For}\For $k=0$ \To $(r-1)$ \Do \newline \phantom{\Begin\For\If}\For $j=0$ \To $(N-1)$ \Do \newline \phantom{\Begin\For\If\If}$(H)^{[i,col]}\la k,l\ra = (H)^{[i,col]}\la k,l\ra + (r_{\theta})^{[i,j]}\la k\ra$; \newline \phantom{\Begin\For\If\If}$(r_{\theta})^{[i,j]}\la k\ra = 0$; \quad{\rm(* reset for next $\theta$ *)} \newline \phantom{\Begin\For}\Endfor\newline \phantom{\Begin\For}\colm \newline \phantom{\Begin}\Endfor \smallskip\noindent $\>${\rm(* Extract lines by enhancing those pixels that contribute to }\newline \phantom{\Begin\For\If}$\>${\rm\ \ \ accumulator cells with counts exceeding the threshold.\ \ \ *)} \smallskip\noindent \phantom{\Begin}\For $t=0$ \To $(\Theta -1)$ \Do \newline \phantom{\Begin\For}$\theta = t\cdot\Delta\theta$; \newline \phantom{\Begin\For}$col = \lf{t\over s}\rf$;\newline \phantom{\Begin\For}$l = t \; {\rm mod}\; s$; \newline \phantom{\Begin\For}\rowm \hskip 1in{\rm(* broadcast $\rho$'s for this $\theta$ *)} \newline \phantom{\Begin\For}\For $k=0$ \To $(r-1)$ \Do \newline \phantom{\Begin\For\If}$(r_{\theta})^{[i,*]}\la k\ra = (H)^{[i,col]}\la k,l\ra$; \newline \phantom{\Begin\For}\colm \newline \phantom{\Begin\For}\For $j=0$ \To $(N-1)$ \Do\newline \phantom{\Begin\For\If}$\rho = i\cos\theta + j\sin\theta$; \newline \phantom{\Begin\For\If}$p = \lf (\rho + \sqrt{2}N) \rf$; \newline \phantom{\Begin\For\If}\If $((r_{\theta})^{[\lf{p\over r}\rf,i]}\la p\; {\rm mod}\; r\ra \geq THRESH)$ \Then \newline \phantom{\Begin\For\If\If}$(L)^{[j,i]} = 1$; \newline \phantom{\Begin\For}\Endfor\newline \phantom{\Begin}\Endfor\newline \Endforall \medskip\noindent \hrule \end{figure*} Table 1 summarizes all the algorithms (for $N \times N$ images) described in this paper and compares the ACMAA time complexities with those of the other architectures. A comparison of these architectures with ACMAA is described in Table 2.
All the above algorithms perform their operations on images of size $N \times N$ using ACMAA of size $N$. They can easily be generalized to larger images ($N > n$, where $n$ is the size of the ACMAA). However, this requires memory modules of larger size, and it will slow down the algorithms by a constant factor. This modification is done by storing an image block of size $\lc {N\over n} \rc \times \lc {N\over n} \rc$ in each memory module, so that each processor is responsible for a strip of the image rather than a one-pixel-wide row or column. A similar blocking scheme is employed for storing the histogram (if $G > n$) and the Hough accumulator array. Hence, the algorithms described above can be used for images whose size is not limited by the number of processors in the ACMAA. This scheme is useful since it is not practical to construct an ACMAA with a large number of processors because of the performance degradation due to long buses. A modified pseudo-code for the case when $N > n$ and $G > n$ in the histogram algorithm described in Sect. 3 is given below. In this algorithm, the input image and the histogram array are stored using a block storage scheme. In this scheme we store a subimage of size $k \times k$ (where $k = \lc N/n \rc$) in each memory module. Hence, the element $x_{ij}$ of an image $X$ is stored in the memory module (subimage) $M_{\lf i/k \rf \lf j/k \rf}$ at the block location $\la i \; {\rm mod}\; k, j \; {\rm mod}\; k \ra$. In the pseudo-code below, the notation $(X)^{[i,j]}\la r,s \ra$ denotes the element of an array $X$ stored in the memory module $M_{ij}$ at location $(r,s)$.
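The block storage mapping can be sketched as follows; the $4 \times 4$ image and $n = 2$ are illustrative values of this sketch:

```python
import math

# Block storage scheme: pixel (i, j) of an N x N image goes to module
# M[i // k][j // k] at block offset (i % k, j % k), where k = ceil(N / n).

def block_store(image, n):
    N = len(image)
    k = math.ceil(N / n)
    # modules[a][b] is the k x k block held by memory module (a, b)
    modules = [[[[None] * k for _ in range(k)]
                for _ in range(n)] for _ in range(n)]
    for i in range(N):
        for j in range(N):
            modules[i // k][j // k][i % k][j % k] = image[i][j]
    return modules

N, n = 4, 2
image = [[10 * i + j for j in range(N)] for i in range(N)]
modules = block_store(image, n)
assert modules[0][0][1][1] == image[1][1]   # (1,1) -> module (0,0), offset (1,1)
assert modules[1][0][0][1] == image[2][1]   # (2,1) -> module (1,0), offset (0,1)
```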
\begin{table*} \caption[]{Comparison of algorithms on various architectures} \halign{\strut #\enspace \hfill & \enspace # \hfill\enspace & \enspace #\enspace \hfill & \enspace #\enspace \hfill & \enspace # \hfill \cr \noalign{\smallskip\hrule\smallskip} & \multispan4{Running time \hfill} \cr & \multispan4{\hrulefill} \cr Algorithm & Sequential & 2d Mesh & Tree & ACMAA \cr & machine & & NON-VON$^1$ & \cr \noalign{\smallskip\hrule\smallskip} Histogramming & $O(N^2)$ & $O(N)$ & $O(G+\log N)$ & $O(N)$ \cr Connected Component Labeling & $O(N^2)$ & $O(N)$ & $O(N)^2$ & $O(N^2)$ \cr Area of a Region & $O(N^2)$ & $O(N)$ & $O(\log N)$ & $O(N)$ \cr Perimeter of a Region & $O(N^2)$ & $O(N)$ & $O(\log N)$ & $O(N)$ \cr Moments of a Region & $O(N^2)$ & $O(N)$ & $O(\log N)$ & $O(N)$ \cr Convex Hull of a Region & $O(N^2)^3$ & $O(N)$ & -- & $O(N)$ \cr Hough Transform & $O(\Theta N^2)$ & $O(\Theta N)$ & -- &$O(\Theta N)$ \cr \noalign{\smallskip\hrule}} \smallskip\noindent $^1$Computed from Ibrahim 1987\newline $^2$This is the average case running time based on the fact that the average number of\\ black rectangles in the binary image tree is $O(N)$ (Ibrahim 1987)\newline $^3$The input is given as a 2-d array of points and not as a list of $(x,y)$ coordinates of points \end{table*} \begin{table} \caption[]{Comparison of different architectures} \halign{\strut #\enspace \hfill& \enspace #\hfill\enspace & \enspace #\hfill\enspace & \enspace #\hfill\enspace & \enspace #\hfill \cr \noalign{\hrule\smallskip} & 2d Mesh & Binary & Pyramid & ACMAA \cr & & Tree & & \cr \noalign{\smallskip\hrule\smallskip} \# Processors & $O(n^2)$ & $O(n^2)$ & $O(n^2)$ & $O(n)$ \cr \noalign{\smallskip\hrule\smallskip} Memory size & $O(n^2)$ & $O(n^2)$ & $O(n^2)$ & $O(n^2)$ \cr \noalign{\smallskip\hrule\smallskip} Diameter & $O(n)$ & $O(\log n)$ & $O(\log n)$ & $O(1)$ \cr \noalign{\smallskip\hrule}} \end{table} \begin{figure*} \hrule \medskip\noindent {\bf Algorithm 7:} Image histogram for $N > n$, and $G > n$ 
\medskip\noindent {\rm (* Initially we store the histogram bucket} $H_m$ {\rm in the column module} $M_{i,\lf m/w \rf}$\newline {\rm$~~~$ of all the} $n$ {\rm rows, where} $w = \lc G/n \rc$ *)\newline \Forall\ {\it processors} $P_i$, $(0 \leq i \leq n-1)$ \Dopar \newline \Begin \newline \phantom{\Begin}\rowm \newline \phantom{\Begin}$w = \lc {G\over n} \rc$; \newline \phantom{\Begin}\For $p=0$ \To $(w-1)$ \Do \newline \phantom{\Begin\For}$(H)^{[i,*]}\la p \ra = 0$;\hskip 0.5in {\rm (* global initialization in each row *)}\newline \newline \phantom{\Begin}\For $j=0$ \To $(n-1)$ \Do \hskip 0.1in {\rm (* determine histogram for row of sub-images *) }\newline \phantom{\Begin\For}\For $p=0$ \To $(\lc {N\over n} \rc - 1)$ \Do \newline \phantom{\Begin\For\If}\For $q=0$ \To $(\lc {N\over n} \rc - 1)$ \Do \newline \phantom{\Begin\For\If\If}$g = (X)^{[i,j]}\la p,q \ra$;\newline \phantom{\Begin\For\If\If}$(H)^{[i,\lf g/w \rf]}\la g \; {\rm mod}\; w \ra = (H)^{[i,\lf g/w \rf]}\la g \; {\rm mod}\; w \ra + 1$;\newline \phantom{\Begin\For\If}\Endfor \newline \phantom{\Begin\For}\Endfor \newline \phantom{\Begin}\Endfor \newline \newline \phantom{\Begin}\colm \hskip 0.3in {\rm (* sum up the columns holding the histogram array} $H$ {\rm *) } \newline \phantom{\Begin}\If $(i \leq \lf {G-1\over n} \rf)$ \Then \newline \phantom{\Begin\For}\For $j = 0$ \To $n-1$ \Do \newline \phantom{\Begin\For\If}\For $p=0$ \To $(w-1)$ \Do \newline \phantom{\Begin\For\If\If}\If $(j \neq i)$ \Then \newline \phantom{\Begin\For\If\If\If}$(H)^{[i,i]}\la p \ra = (H)^{[i,i]}\la p \ra + (H)^{[j,i]}\la p \ra$; \newline \phantom{\Begin\For\If}\Endfor \newline \phantom{\Begin\For}\Endfor \newline \phantom{\Begin}\Endif \newline \Endforall \medskip\noindent\hrule \end{figure*} ACMAA's efficient local and global communication schemes make it suitable for neighborhood processing (Balsara and Irwin 1991) as well as for the global aggregation of values at a number of image points. Hence, it is applicable to a wide range of computer vision tasks. 
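The row-mode accumulation and column-mode reduction of Algorithm 7 can be checked with a short sequential simulation in Python. This sketch collapses the per-module distribution of $H$ into one partial histogram per processor; the function name `acmaa_histogram` is ours, not from the paper.

```python
from math import ceil

def acmaa_histogram(image, n, G):
    """Sequential simulation of the blocked histogram algorithm:
    each of the n (virtual) processors scans its strip of ceil(N/n)
    image rows into a private partial histogram (row mode); the n
    partial histograms are then summed bucket by bucket, mirroring
    the column-mode reduction."""
    N = len(image)
    k = ceil(N / n)
    partials = []
    for i in range(n):                       # processor P_i, row mode
        h = [0] * G
        for row in image[i * k:(i + 1) * k]:
            for g in row:
                h[g] += 1
        partials.append(h)
    # column mode: sum corresponding buckets across all processors
    return [sum(p[m] for p in partials) for m in range(G)]
```

On the real architecture the outer loop runs concurrently on the $n$ processors, and the final summation uses the column buses; the sequential simulation only checks that the two phases produce a correct histogram.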
It is also economical compared with the other architectures proposed for image processing and computer vision, viz., the mesh, tree and pyramid, since it needs fewer processors. Table 2 compares ACMAA with these architectures (based on a one-pixel-per-processor or per-memory-module scheme for an $n \times n$ image). Currently, we are designing and implementing a 4-processor prototype of ACMAA using off-the-shelf components. Since ACMAA is an SIMD architecture, it is controlled by a host processor which is responsible for the global control of the architecture and for interaction with the outside world. This host processor ensures that all processors execute the same set of instructions in lock-step fashion, thus guaranteeing SIMD operation. Image input/output to and from the memory array can be done by scanning an image frame buffer row by row, left to right, and using the row buses to access the memory array. The control and addressing information in input/output mode is generated by the host. For high-speed operation one can construct an ACMAA with more than one memory array, so that while the processors are operating on one array the host can perform input/output on the other. For an ACMAA based on 64 Motorola 68008 processors with an 8\ts MHz clock, our preliminary calculations predict a running time of 12.028\ts ms for the histogram algorithm (Sect. 3) on a $512 \times 512$ image stored in memory with an access time of 200\ts ns. The corresponding time for a sequential algorithm running on the same processor was 721.248\ts ms. Note that these approximate timings were obtained by manually determining the number and type of instructions needed to execute the algorithm. Actual timings will depend upon the components used and their layout in the system, especially that of the buses. 
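As a quick sanity check on these predicted figures, the two timings quoted above imply a near-linear speedup on 64 processors (speedup and efficiency here are the standard ratios, not quantities computed in the paper):

```python
t_seq = 721.248            # ms, sequential algorithm on one MC68008
t_par = 12.028             # ms, predicted on a 64-processor ACMAA
speedup = t_seq / t_par    # close to 60
efficiency = speedup / 64  # fraction of ideal linear speedup, ~0.94
```

This is consistent with the linear speedup claimed for most of the ACMAA algorithms.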
Memory circuits used in the architecture will play an important role in determining the performance of ACMAA, because too many accesses to the memory array may hinder its use for high-speed computations. Massively parallel architectures, e.g., an $O(N^2)$ mesh, are normally designed to have very few memory accesses. However, they are inefficient for global communications, and since the number of processors is large, an individual processor has limited local memory. These factors may degrade the performance of such architectures on problems requiring large memories and efficient communications. ACMAA, on the other hand, is efficient for local as well as global communications, but involves accesses to the memory array. Hence, the choice of approach was mainly governed by the need for an architecture that is suitable for a wide range of vision tasks. \section{Conclusion} Parallel algorithms for intermediate-level computer vision on the Access Constrained Memory Array Architecture (ACMAA) have been presented in this paper. The choice of algorithms demonstrates the applicability of ACMAA to several computer vision and image analysis tasks that involve local as well as global processing. This was done by exploiting the row and column bus structure of the memory array, which leads to efficient local and global communications among the processors. Most of the ACMAA algorithms achieve a linear speedup over sequential algorithms, and they are optimal for an $N$-processor system. \begin{thebibliography}{References} \bibitem{R1} Ballard D, Brown C (1982) Computer vision. Prentice-Hall, Englewood Cliffs, NJ \bibitem{R2} Buehrer R (1982) The ETH-Multiprocessor EMPRESS: A dynamically configurable MIMD system. IEEE Trans Comput C-31(11):1035--1044 \bibitem{R3} Balsara PT, Irwin MJ (1991) Image processing on a memory array architecture. 
VLSI Signal Proc (2):313--324 \bibitem{R4} Cypher R, Sanz JLC, Snyder L (1987) Practical algorithms for image component labeling on SIMD mesh connected computers. In: Proceedings of the International Conference on Parallel Processing, pp 772--779 \bibitem{R5} Duff MJB, Fountain TJ (eds) (1986) Cellular logic image processing. Academic Press, London \bibitem{R6} Duda R, Hart P (1972) Use of the Hough transform to detect lines and curves in pictures. Commun ACM 15(1):11--15 \bibitem{R7} Danielsson P, Levialdi S (1981) Computer architectures for pictorial information systems. IEEE Computer, pp 53--67 \bibitem{R8} Fountain TJ, Matthews KN, Duff MJB (1988) The CLIP7A image processor. IEEE Trans Pattern Anal Mach Intell 10(3):310--318 \bibitem{R9} Gonzalez R, Wintz P (1977) Digital image processing. Addison-Wesley, Reading \bibitem{R10} Hwang K, Fu K (1983) Integrated computer architectures for image processing and database management. IEEE Computer, pp 51--60 \bibitem{R11} Hwang K, Tseng PS, Kim D (1989) An orthogonal multiprocessor for large-grain scientific computations. IEEE Trans Comput 38(1) \bibitem{R12} Ibrahim HAH (1984) Image understanding algorithms on fine-grained tree-structured SIMD machines. PhD thesis, Computer Science Dept., Columbia University \bibitem{R13} Ibrahim HAH, Kender J, Shaw DE (1987) Low level image analysis tasks on fine-grained tree-structured SIMD machines. J Parallel Distrib Comput 4:546--574 \bibitem{R14} Kennedy J (1989) Simulator for a memory array architecture. MS paper, Dept. of Computer Science, Penn State University, University Park, PA 16802 \bibitem{R15} Lumia R, Shapiro L, Zuniga O (1983) A new connected component algorithm for virtual memory computers. Comput Graph Image Process 22:287--300 \bibitem{R16} Miller R, Stout QF (1985) Geometric algorithms for digitized pictures on a mesh-connected computer. 
IEEE Trans Pattern Anal Mach Intell PAMI-7(2):216--228 \bibitem{R17} Nassimi D, Sahni S (1980) Finding connected components and connected ones on a mesh connected parallel computer. SIAM J Comput 9(4):744--757 \bibitem{R18} Rosenfeld A, Kak AC (1982) Digital picture processing. Academic Press, New York \bibitem{R19} Silberberg T (1985) The Hough transform on the geometric arithmetic parallel processor. In: Proceedings of the IEEE Workshop on Computer Architecture for Pattern Analysis and Image Database Management, pp 387--393 \bibitem{R20} Scherson I, Ma Y (1987) Vector computations on an orthogonal memory access multiprocessing system. In: 8th IEEE Symposium on Computer Arithmetic, pp 28--37 \bibitem{R21} Scherson I, Ma Y (1989) Analysis and applications of orthogonal access multiprocessor. J Parallel Distrib Comput 7(2):232--255 \bibitem{R22} Weems C (1984) Image processing on a content addressable array parallel processor. Technical Report COINS Tech. Report 84-14, University of Massachusetts, Amherst \end{thebibliography} \begin{biography}{Son Pham} is currently a professor of Computer Science in the College of Engineering and Computer Science at California State University, Northridge. His research interests include Computer Graphics and Software Engineering. He has published a dozen refereed academic/technical papers in Computer Science. He is a member of ACM. Pham received his BA in Mathematics from the University of Saigon, Vietnam in 1973; MA in Mathematics from the University of Louisville, Kentucky in 1975; Ph.D. in Statistics from the University of Cincinnati, Ohio in 1978; and PD in Computer Science from the University of California, Berkeley in 1980. \end{biography} \begin{biography}{Mel Slater} joined the Department of Computer Science, Queen Mary and Westfield College (University of London) in 1981. Since 1982 he has been involved in teaching and research in computer graphics. 
He produced an early implementation of the GKS graphics standard, and was involved in the ISO graphics group, with responsibility for the PASCAL language binding to GKS. He co-authored an undergraduate textbook on computer graphics, which was published in 1987. He is a principal investigator in several funded research projects, in particular the European Community ESPRIT funded SPIRIT Workstation project, and a project on Virtual Reality with the London Parallel Applications Centre. He was a visiting professor in the Computer Science Division of Electrical Engineering and Computer Sciences, University of California at Berkeley in the spring semesters of 1991 and 1992. \end{biography} \vfill \newpage \begin{biography}{W. Kenneth Stewart} is an assistant scientist at the Deep Submergence Laboratory of the Woods Hole Oceanographic Institution. His research interests include underwater robotics, autonomous vehicles and smart ROVs, multisensor modeling, real-time acoustic and optical imaging, and precision underwater surveying. Stewart has been going to sea on oceanographic research vessels for 19 years, has developed acoustic sensors and remotely operated vehicles for 6000-m depths, and has made several deep dives in manned submersibles, including a 4000-m excursion to the {\it RMS Titanic} in 1986. He is a member of the Marine Technology Society, Oceanography Society, IEEE Computer Society, ACM SIGGRAPH, and NCGA. Stewart received a PhD in oceanographic engineering from the Massachusetts Institute of Technology and Woods Hole Oceanographic Institution Joint Program in 1988, a BS in ocean engineering from Florida Atlantic University in 1982, and an AAS in marine technology from Cape Fear Technical Institute in 1972. \end{biography} \end{document}