% Journal article (JACS, 2002): NMR and hybrid-DFT study of how electronegative
% substituents on proline affect ring pucker and collagen stability.
@ARTICLE{derider02collagen,
  author   = {DeRider, Michele L. and Wilkens, Steven J. and Waddell, Michael J.
              and Bretscher, Lynn E. and Weinhold, Frank and Raines, Ronald T.
              and Markley, John L.},
  title    = {Collagen Stability: Insights from {NMR} Spectroscopic and Hybrid
              Density Functional Computational Investigations of the Effect of
              Electronegative Substituents on Prolyl Ring Conformations},
  journal  = {Journal of the American Chemical Society},
  volume   = {124},
  number   = {11},
  pages    = {2497--2505},
  year     = {2002},
  month    = mar,
  doi      = {10.1021/ja0166904},
  abstract = {Collagen-like peptides of the type (Pro-Pro-Gly)(10) fold into
    stable triple helices. An electron-withdrawing substituent at the
    H(gamma)(3) ring position of the second proline residue stabilizes these
    triple helices. The aim of this study was to reveal the structural and
    energetic origins of this effect. The approach was to obtain experimental
    NMR data on model systems and to use these results to validate
    computational chemical analyses of these systems. The most striking
    effects of an electron-withdrawing substituent are on the ring pucker of
    the substituted proline (Pro(i)) and on the trans/cis ratio of the
    Xaa(i-1)-Pro(i) peptide bond. NMR experiments demonstrated that
    N-acetylproline methyl ester (AcProOMe) exists in both the C(gamma)-endo
    and C(gamma)-exo conformations (with the endo conformation slightly
    preferred), N-acetyl-4(R)-fluoroproline methyl ester (Ac-4R-FlpOMe)
    exists almost exclusively in the C(gamma)-exo conformation, and
    N-acetyl-4(S)-fluoroproline methyl ester (Ac-4S-FlpOMe) exists almost
    exclusively in the C(gamma)-endo conformation. In dioxane, the
    K(trans/cis) values for AcProOMe, Ac-4R-FlpOMe, and Ac-4S-FlpOMe are 3.0,
    4.0, and 1.2, respectively. Density functional theory (DFT) calculations
    with the (hybrid) B3LYP method were in good agreement with the
    experimental data. Computational analysis with the natural bond orbital
    (NBO) paradigm shows that the pucker preference of the substituted prolyl
    ring is due to the gauche effect. The backbone torsional angles, phi and
    psi, were shown to correlate with ring pucker, which in turn correlates
    with the known phi and psi angles in collagen-like peptides. The
    difference in K(trans/cis) between AcProOMe and Ac-4R-FlpOMe is due to an
    $n\rightarrow\pi^{*}$ interaction associated with the
    B{\"u}rgi--Dunitz trajectory. The decrease in K(trans/cis) for
    Ac-4S-FlpOMe can be explained by destabilization of the trans isomer
    because of unfavorable electronic and steric interactions. Analysis of
    the results herein along with the structures of collagen-like peptides
    has led to a theory that links collagen stability to the interplay
    between the pyrrolidine ring pucker, phi and psi torsional angles, and
    peptide bond trans/cis ratio of substituted proline residues.},
  url      = {http://pubs.acs.org/journals/jacsat/article.cgi/jacsat/2002/124/i11/pdf/ja0166904.pdf},
  pdf      = {http://www.waddellinformatics.com/download/derider02collagen.pdf},
  ps       = {http://www.waddellinformatics.com/download/derider02collagen.ps.gz},
  keywords = {nmr, collagen, nbo, proline}
}
% Technical report 1453 (2002): comparison of supervised learners on a
% multiple-myeloma microarray data set.
@TECHREPORT{page02comparative,
  author      = {Page, David and Zhan, Fenghuang and Cussens, James
                 and Waddell, Michael and Hardin, Johanna and Barlogie, Bart
                 and Shaughnessy, Jr., John},
  title       = {Comparative Data Mining for Microarrays: A Case Study Based on
                 Multiple Myeloma},
  institution = {Computer Sciences Department, University of Wisconsin},
  number      = {1453},
  year        = {2002},
  month       = nov,
  abstract    = {Supervised machine learning and data mining tools have become
    popular for the analysis of gene expression microarray data. They have
    the potential to uncover new therapeutic targets for diseases, to predict
    how patients will respond to specific treatments, and to uncover
    regulatory relationships among genes in normal and disease situations.
    Comparative experiments are needed to identify the advantages of the
    leading supervised learning algorithms for microarray data, as well as to
    give direction in methodological decisions. This paper compares support
    vector machines, Bayesian networks, decision trees, boosted decision
    trees, and voting (ensembles of decision stumps) on a new microarray data
    set for cancer with over 100 samples. The paper provides evidence for
    several important lessons for mining microarray data, including: (1)
    Bayes nets and ensembles perform at least as well as other approaches but
    arguably provide more direct insight; (2) the common practice of throwing
    out low or negative average differences, or those accompanied by an
    absent call, is a mistake; (3) looking for consistent differences in
    expression may be more important than large differences.},
  url         = {ftp://ftp.cs.wisc.edu/pub/tech-reports/reports/2002/tr1453.ps.Z},
  pdf         = {http://www.waddellinformatics.com/download/page02comparative.pdf},
  ps          = {http://www.waddellinformatics.com/download/page02comparative.ps.gz},
  keywords    = {microarray, myeloma, genomics, algorithms, svms, bayes, trees,
                 ensembles, eov}
}
% Conference paper (Euro-Par 2003, LNCS 2790): automated management of large
% batches of machine-learning experiments.
@INPROCEEDINGS{dutra03toward,
  author    = {Dutra, In{\^{e}}s de Castro and Page, David and Costa, Vitor
               Santos and Shavlik, Jude W. and Waddell, Michael},
  title     = {Toward Automatic Management of Embarrassingly Parallel Applications},
  booktitle = {Euro-Par 2003. Parallel Processing, 9th International Euro-Par
               Conference, Klagenfurt, Austria, August 26--29, 2003.
               Proceedings},
  editor    = {Kosch, Harald and
               B{\"{o}}sz{\"{o}}rm{\'{e}}nyi, L{\'{a}}szl{\'{o}} and
               Hellwagner, Hermann},
  publisher = {Springer-Verlag},
  series    = {Lecture Notes in Computer Science},
  volume    = {2790},
  pages     = {509--516},
  isbn      = {3-540-40788-X},
  year      = {2003},
  month     = aug,
  abstract  = {Large-scale applications that require executing very large
    numbers of tasks are only feasible through parallelism. In this work we
    present a system that automatically handles large numbers of experiments
    and data in the context of machine learning. Our system controls all
    experiments, including re-submission of failed jobs and relies on
    available resource managers to spawn jobs through pools of machines. Our
    results show that we can manage a very large number of experiments, using
    a reasonable amount of idle CPU cycles, with very little user
    intervention.},
  url       = {http://citeseer.nj.nec.com/599812.html},
  pdf       = {http://www.waddellinformatics.com/download/dutra03toward.pdf},
  ps        = {http://www.waddellinformatics.com/download/dutra03toward.ps.gz},
  keywords  = {collaboration, parallelism}
}
% Journal article (AI Magazine, 2004): survey of machine learning applied to
% gene-expression microarray data.
@ARTICLE{molla04using,
  author   = {Molla, Michael and Waddell, Michael and Page, David and
              Shavlik, Jude},
  title    = {Using Machine Learning to Design and Interpret Gene-Expression
              Microarrays},
  journal  = {{AI} Magazine},
  volume   = {25},
  number   = {1},
  pages    = {23--44},
  year     = {2004},
  abstract = {Gene-expression microarrays, commonly called `gene chips,' make
    it possible to simultaneously measure the rate at which a cell or tissue
    is expressing---translating into a protein---each of its thousands of
    genes. One can use these comprehensive snapshots of biological activity
    to infer regulatory pathways in cells, identify novel targets for drug
    design, and improve the diagnosis, prognosis, and treatment planning for
    those suffering from disease. However, the amount of data this new
    technology produces is more than one can manually analyze. Hence, the
    need for automated analysis of microarray data offers an opportunity for
    machine learning to have a significant impact on biology and medicine.
    This article describes microarray technology, the data it produces, and
    the types of machine-learning tasks that naturally arise with this data.
    It also reviews some of the recent prominent applications of machine
    learning to gene-chip data, points to related tasks where machine
    learning may have a further impact on biology and medicine, and describes
    additional types of interesting data that recent advances in
    biotechnology allow biomedical researchers to collect.},
  url      = {http://www.aaai.org/Papers/Magazine/Vol25/25-01/AIMag25-01-004.pdf},
  pdf      = {http://www.waddellinformatics.com/download/molla04using.pdf},
  ps       = {http://www.waddellinformatics.com/download/molla04using.ps.gz},
  keywords = {bioinformatics, microarray, genomics}
}
% Seminar talk (April 2000) on the theoretical basis of collagen stability.
@MISC{waddell00theoretical1,
  author       = {Waddell, Michael J.},
  title        = {Theoretical Analysis of the Basis of Collagen Stability},
  howpublished = {Seminar},
  note         = {Hilldale Undergraduate/Faculty Research Seminar, University of
                  Wisconsin-Madison, Madison, Wisconsin, April 24, 2000},
  year         = {2000},
  month        = apr,
  keywords     = {collagen, nbo, proline}
}
% Senior undergraduate thesis (May 2000) on the theoretical basis of collagen
% stability; kept as MISC because classic BibTeX has no bachelor-thesis type.
@MISC{waddell00theoretical2,
  author       = {Waddell, Michael J.},
  title        = {Theoretical Analysis of the Basis of Collagen Stability},
  howpublished = {Senior Undergraduate Thesis},
  note         = {University of Wisconsin, Department of Biochemistry},
  year         = {2000},
  month        = may,
  pdf          = {http://www.waddellinformatics.com/download/waddell00theoretical2.pdf},
  ps           = {http://www.waddellinformatics.com/download/waddell00theoretical2.ps.gz},
  keywords     = {collagen, nbo, proline}
}
% Poster (ISMB, August 2002): comparison of learners on a multiple-myeloma
% microarray data set.
@MISC{waddell02comparative,
  author       = {Waddell, Michael and Page, David and Zhan, Fenghuang
                  and Barlogie, Bart and Shaughnessy, Jr., John and Hardin, Johanna
                  and Cussens, James},
  title        = {Comparative Data Mining for Microarrays: A Case Study Based on
                  Multiple Myeloma},
  howpublished = {Poster},
  note         = {International Conference on Intelligent Systems for Molecular
                  Biology, Poster Session 1, Edmonton, Alberta, Canada, August 4,
                  2002},
  year         = {2002},
  month        = aug,
  abstract     = {These studies compare SVMs, Bayesian networks, decision
    trees, boosted decision trees and voting (ensembles of decision stumps)
    on a new microarray data set for cancer (multiple myeloma) with over 100
    samples. They provide evidence for several important lessons about how
    these techniques should be used for mining microarray data.},
  pdf          = {http://www.waddellinformatics.com/download/waddell02comparative.pdf},
  ps           = {http://www.waddellinformatics.com/download/waddell02comparative.ps.gz},
  keywords     = {genomics, microarray, myeloma, algorithms}
}
% Poster (Abbott Laboratories, July 2003): SELDI Filter, a tool automating the
% analysis of SELDI-TOF proteomic mass spectrometry data.
% NOTE(review): pdf/ps point at /pubs/ while summary points at /download/ --
% confirm which path is current.
@MISC{waddell03seldi,
  author       = {Waddell, Michael J.},
  title        = {{SELDI Filter}: Automating the Filtering and Analysis of
                  Proteomic Mass Spectrometry Data},
  howpublished = {Poster},
  note         = {Abbott Laboratories Science Intern Poster Session, Abbott Park,
                  Illinois, July 23, 2003},
  year         = {2003},
  month        = jul,
  abstract     = {One of the technologies being used at Abbott for discovering
    biomarkers for disease states, toxicity and treatment is Surface Enhanced
    Laser Desorption and Ionization (SELDI) combined with time of flight
    (TOF) mass spectrometry. SELDI combines highly specific sample enrichment
    with a sensitive mass measurement, thereby allowing researchers to
    discover very low abundant (down to 10 fmol) biomarkers. However, the
    volume of data obtained with SELDI can quickly overwhelm the researcher's
    ability to analyze manually. \\
    \\
    SELDI Filter is a program that I have developed during my internship at
    Abbott to automate much of this tedious analysis process. It integrates
    with the Ciphergen ProteinChip\copyright{} software that the researchers
    are using to collect the SELDI data and presents its analysis results in
    either Microsoft Word\copyright{} or Microsoft PowerPoint\copyright{} for
    ease of integration into papers and presentations. \\
    \\
    SELDI Filter currently automates the following analysis methods: ANOVA,
    Discriminant Analysis and Partition Analysis. However, due to its modular
    design, other techniques can quickly and easily be added.},
  pdf          = {http://www.waddellinformatics.com/pubs/waddell03seldi.pdf},
  ps           = {http://www.waddellinformatics.com/pubs/waddell03seldi.ps.gz},
  summary      = {http://www.waddellinformatics.com/download/waddell03seldi-summary.txt},
  keywords     = {proteomics, seldi, spectrometry}
}
% Seminar talk (CIBM, February 2003): diagnostic models separating multiple
% myeloma, MGUS and normal plasma cells from gene expression profiles.
@MISC{waddell03toward,
  author       = {Waddell, Michael J.},
  title        = {Toward the Development of Diagnostic Models Capable of
                  Distinguishing Multiple Myeloma, {MGUS}, and Normal Plasma Cells
                  Using Global Gene Expression Profiles},
  howpublished = {Seminar},
  note         = {Computation and Informatics in Biology and Medicine (CIBM)
                  Seminar Series, University of Wisconsin-Madison, Madison, Wisconsin,
                  February 11, 2003},
  year         = {2003},
  month        = feb,
  abstract     = {Standard laboratory classification of the plasma cell
    dyscrasia monoclonal gammopathy of undetermined significance (MGUS) and
    the overt plasma cell neoplasm multiple myeloma (MM) is quite accurate,
    yet, for the most part, prognostically uninformative. Most, if not all,
    cancers are caused by inherited or acquired genetic mutations that
    manifest themselves in altered gene expression patterns in the clonally
    related cancer cells. Microarray technology allows for qualitative and
    quantitative measurements of the expression levels of thousands of genes
    simultaneously, and it has now been used both to classify cancers that
    are morphologically indistinguishable and to predict response to therapy.
    However, standard data analysis techniques are not trivial to employ on
    these large data sets. We report on the application of a panel of
    statistical and data mining methodologies to classify groups of samples
    based on expression of 12,000 genes derived from a high density
    oligonucleotide microarray analysis of highly purified plasma cells from
    newly diagnosed MM, MGUS, and normal healthy donors and the prediction
    errors for each of the models and each of the methods. Additionally, we
    report ROC curves for the comparisons of MM versus MGUS and results on
    predicting MGUS from a model that distinguishes MM samples from normal
    samples.},
  pdf          = {http://www.waddellinformatics.com/download/waddell03toward.pdf},
  ps           = {http://www.waddellinformatics.com/download/waddell03toward.ps.gz},
  keywords     = {genomics, microarray, myeloma}
}
% Seminar talk (NLM Training Directors' Meeting, July 2003): predicting cancer
% susceptibility from SNP data.
@MISC{waddell03predicting,
  author       = {Waddell, Michael J.},
  title        = {Predicting Cancer Susceptibility from Single-Nucleotide
                  Polymorphism Data: A Case Study in Multiple Myeloma},
  howpublished = {Seminar},
  note         = {National Library of Medicine Training Directors' Meeting 2003,
                  Bethesda, Maryland, July 9, 2003},
  year         = {2003},
  month        = jul,
  pdf          = {http://www.waddellinformatics.com/download/waddell03predicting.pdf},
  ps           = {http://www.waddellinformatics.com/download/waddell03predicting.ps.gz},
  keywords     = {snps, genomics, myeloma}
}
% Seminar talk (CIBM, March 2004): modelling SNP patterns with support vector
% machines to predict early versus late onset of multiple myeloma.
@MISC{waddell04modeling,
  author       = {Waddell, Michael J.},
  title        = {Modeling Patterns in Single-Nucleotide Polymorphism Data for
                  Predicting Cancer Susceptibility: A Case Study in Multiple Myeloma},
  howpublished = {Seminar},
  note         = {Computation and Informatics in Biology and Medicine (CIBM)
                  Seminar Series, University of Wisconsin-Madison, Madison, Wisconsin,
                  March 2, 2004},
  year         = {2004},
  month        = mar,
  abstract     = {The past two decades have witnessed the identification of
    genes responsible for a number of inherited human disorders. However,
    most of these successes were with disorders that are caused by single
    genes. Attempts to identify groups of genes that result in inherited
    disorders through their collective action have been largely unsuccessful.
    One reason for this difficulty is that standard genetics techniques are
    more sensitive to large, consistent changes in single genes than to
    consistent patterns of small changes in groups of genes. \\
    \\
    In order to find the groups of genes that result in these more complex
    disorders, researchers need to identify groups of genes whose collective
    action is consistent, even though individual genes may not be. We propose
    that standard machine learning algorithms can be utilized to address this
    goal. In this seminar, I will discuss using support vector machines
    (SVMs) to model patterns in single nucleotide polymorphisms (SNPs) that
    are associated with early versus late onset of a particularly deadly form
    of cancer, Multiple Myeloma. The goal of building accurate models is not
    only to assess risk, but to provide insight into the disease and
    potentially offer novel drug targets.},
  pdf          = {http://www.waddellinformatics.com/download/waddell04modeling.pdf},
  ps           = {http://www.waddellinformatics.com/download/waddell04modeling.ps.gz},
  keywords     = {genomics, snps, myeloma}
}
% Poster (CIBM retreat, October 2004): user study on whether learned
% hypotheses help people performing a learning task.
% NOTE(review): pdf/ps point at /pubs/ while most entries use /download/ --
% confirm which path is current.
@MISC{waddell04validating,
  author       = {Waddell, Michael J.},
  title        = {Validating the Effectiveness of Machine Learning Assistance},
  howpublished = {Poster},
  note         = {Computation and Informatics in Biology and Medicine (CIBM) Training
                  Program Retreat, University of Wisconsin-Madison, Madison,
                  Wisconsin, October 15, 2004},
  year         = {2004},
  month        = oct,
  abstract     = {This poster describes the results of the first part of a
    two-part study to measure the effectiveness of providing users with
    learned hypotheses when performing a learning task. The first part of the
    study uses a simple domain based on the ``East-West Challenge.'' In this
    study, subjects are given a set of pictures of cartoon trains where half
    are labeled ``eastbound'' and half are labeled ``westbound.'' The
    subjects' task is to correctly label a second, unlabeled set of trains
    using the hypothesis they learned from the first set. Some subjects will
    receive the output of a relational learning system to aid them in this
    task. In the second part of this study, we will conduct a similar task
    using mass spectrometry data. These preliminary studies will lay the
    groundwork for the validation of other types of collaborative machine
    learning systems. This type of validation is not routinely done when
    working with machine learning systems, but instead it is assumed that any
    help a system can provide is beneficial. This study will challenge that
    assumption and propose instead that the type of assistance given by a
    system is as important as the quality of its learned hypotheses.},
  pdf          = {http://www.waddellinformatics.com/pubs/waddell04validating.pdf},
  ps           = {http://www.waddellinformatics.com/pubs/waddell04validating.ps.gz},
  keywords     = {collaboration, validating, algorithms}
}
% Journal article (SAGMB 3(1), 2004): evaluation of several models for
% separating MM, MGUS and normal samples from DNA microarray data.
@ARTICLE{hardin04evaluation,
  author   = {Hardin, Johanna and Waddell, Michael and Page, C. David and
              Zhan, Fenghuang and Barlogie, Bart and Shaughnessy, Jr., John and
              Crowley, John},
  title    = {Evaluation of Multiple Models to Distinguish Closely Related Forms
              of Disease Using {DNA} Microarray Data},
  journal  = {Statistical Applications in Genetics and Molecular Biology},
  volume   = {3},
  number   = {1},
  year     = {2004},
  month    = jun,
  abstract = {Motivation: Standard laboratory classification of the plasma
    cell dyscrasia monoclonal gammopathy of undetermined significance (MGUS)
    and the overt plasma cell neoplasm multiple myeloma (MM) is quite
    accurate, yet, for the most part, biologically uninformative. Most, if
    not all, cancers are caused by inherited or acquired genetic mutations
    that manifest themselves in altered gene expression patterns in the
    clonally related cancer cells. Microarray technology allows for
    qualitative and quantitative measurements of the expression levels of
    thousands of genes simultaneously, and it has now been used both to
    classify cancers that are morphologically indistinguishable and to
    predict response to therapy. It is anticipated that this information can
    also be used to develop molecular diagnostic models and to provide
    insight into mechanisms of disease progression, e.g., transition from
    healthy to benign hyperplasia or conversion of a benign hyperplasia to
    overt malignancy. However, standard data analysis techniques are not
    trivial to employ on these large data sets. Methodology designed to
    handle large data sets (or modified to do so) is needed to access the
    vital information contained in the genetic samples, which in turn can be
    used to develop more robust and accurate methods of clinical diagnostics
    and prognostics. \\
    \\
    Results: Here we report on the application of a panel of statistical and
    data mining methodologies to classify groups of samples based on
    expression of 12,000 genes derived from a high density oligonucleotide
    microarray analysis of highly purified plasma cells from newly diagnosed
    MM, MGUS, and normal healthy donors. The three groups of samples are each
    tested against each other. The methods are found to be similar in their
    ability to predict group membership; all do quite well at predicting MM
    vs. normal and MGUS vs. normal. However, no method appears to be able to
    distinguish explicitly the genetic mechanisms between MM and MGUS. We
    believe this might be due to the lack of genetic differences between
    these two conditions, and may not be due to the failure of the models. We
    report the prediction errors for each of the models and each of the
    methods. Additionally, we report ROC curves for the comparisons and
    results on predicting MGUS from a model that distinguishes MM samples
    from normal samples.},
  url      = {http://www.bepress.com/sagmb/vol3/iss1/art10/},
  pdf      = {http://www.waddellinformatics.com/download/hardin04evaluation.pdf},
  ps       = {http://www.waddellinformatics.com/download/hardin04evaluation.ps.gz},
  keywords = {myeloma, genomics, microarray, algorithms}
}
% Workshop paper (BIOKDD '05): SVM-based prediction of early-onset multiple
% myeloma susceptibility from SNP profiles.
@INPROCEEDINGS{waddell05predicting,
  author    = {Waddell, Michael and Page, David and
               Zhan, Fenghuang and Barlogie, Bart and Shaughnessy, Jr., John},
  title     = {Predicting Cancer Susceptibility from Single-Nucleotide Polymorphism
               Data: A Case Study in Multiple Myeloma},
  booktitle = {Proceedings of {BIOKDD} '05, Chicago, Illinois, August 2005},
  year      = {2005},
  month     = aug,
  abstract  = {This paper asks whether susceptibility to early-onset
    (diagnosis before age 40) of a particularly deadly form of cancer,
    Multiple Myeloma, can be predicted from single-nucleotide polymorphism
    (SNP) profiles with an accuracy greater than chance. Specifically, given
    SNP profiles for 80 Multiple Myeloma patients -- of which we believe 40
    to have high susceptibility and 40 to have lower susceptibility -- we
    train a support vector machine (SVM) to predict age at diagnosis. We
    chose SVMs for this task because they are well suited to deal with
    interactions among features and redundant features. The accuracy of the
    trained SVM estimated by leave-one-out cross-validation is 71\%,
    significantly greater than random guessing. This result is particularly
    encouraging since only 3000 SNPs were used in profiling, whereas several
    million SNPs are known.},
  pdf       = {http://www.waddellinformatics.com/download/waddell05predicting.pdf},
  ps        = {http://www.waddellinformatics.com/download/waddell05predicting.ps.gz},
  data      = {http://www.waddellinformatics.com/download/waddell05predicting.zip}
}
% Seminar talk given at BIOKDD '05 (August 21, 2005); the title and abstract
% match those of the waddell05predicting entry.
@MISC{waddell05predicting2,
  author       = {Waddell, Michael and Page, David and
                  Zhan, Fenghuang and Barlogie, Bart and Shaughnessy, Jr., John},
  title        = {Predicting Cancer Susceptibility from Single-Nucleotide Polymorphism
                  Data: A Case Study in Multiple Myeloma},
  howpublished = {Seminar},
  note         = {{BIOKDD} '05, Chicago, Illinois, August 21, 2005},
  year         = {2005},
  month        = aug,
  abstract     = {This paper asks whether susceptibility to early-onset
    (diagnosis before age 40) of a particularly deadly form of cancer,
    Multiple Myeloma, can be predicted from single-nucleotide polymorphism
    (SNP) profiles with an accuracy greater than chance. Specifically, given
    SNP profiles for 80 Multiple Myeloma patients -- of which we believe 40
    to have high susceptibility and 40 to have lower susceptibility -- we
    train a support vector machine (SVM) to predict age at diagnosis. We
    chose SVMs for this task because they are well suited to deal with
    interactions among features and redundant features. The accuracy of the
    trained SVM estimated by leave-one-out cross-validation is 71\%,
    significantly greater than random guessing. This result is particularly
    encouraging since only 3000 SNPs were used in profiling, whereas several
    million SNPs are known.},
  pdf          = {http://www.waddellinformatics.com/download/waddell05predicting2.pdf},
  ps           = {http://www.waddellinformatics.com/download/waddell05predicting2.ps.gz}
}