by Jie Chen, Xiwei Xu, Leon J. Osterweil, Liming Zhu, Yuriy Brun, Len Bass, Junchao Xiao, Mingshu Li, Qing Wang
Abstract:
The processes for deploying systems in cloud environments can serve as a basis for studying strategies for detecting and correcting errors committed during complex process execution. These cloud-based processes encompass diverse activities and entail complex interactions among cloud infrastructure, application software, tools, and humans. Many of these processes, such as those for making release decisions during continuous deployment and troubleshooting system upgrades, are highly error-prone. Unlike the deployed software systems themselves, which are typically well tested, these deployment processes are usually neither well understood nor well tested. Errors that occur during such processes may require time-consuming troubleshooting, undoing and redoing steps, and problem fixing. Consequently, these processes should ideally be guided by error-detection strategies that consider trade-offs between efficiency and reliability. This paper presents a framework for systematically exploring such trade-offs. To evaluate the framework and illustrate our approach, we use two representative cloud deployment processes: a continuous deployment process and a rolling upgrade process. We augment an existing process modeling language to represent these processes and to model errors that may occur during process execution. We use a process-aware discrete-event simulator to evaluate strategies and empirically validate the simulation results by comparing them to experiences in a production environment. Our evaluation demonstrates that our approach supports studying how error-handling strategies affect the time spent on task completion and error fixing.
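
The paper's own process models and process-aware simulator are not reproduced here; as a loose illustration of the efficiency-versus-reliability trade-off the abstract describes, the following minimal Python sketch compares two hypothetical error-detection strategies for an idealized multi-step deployment. All names and parameters (step_time, check_time, fix_time, error_prob, and the strategies "eager" and "lazy") are invented for illustration and are not taken from the paper.

    import random

    def simulate(num_steps, step_time, check_time, fix_time, error_prob,
                 strategy, seed=None):
        """One run of an idealized multi-step deployment process.

        strategy 'eager': a check after every step catches an error immediately;
        strategy 'lazy':  a single check at the end; a missed error forces
                          redoing every step from the erroneous one onward.
        Returns (total_time, time_spent_on_error_handling).
        """
        rng = random.Random(seed)
        total = fixing = 0.0
        if strategy == "eager":
            for _ in range(num_steps):
                total += step_time + check_time
                # Redo the step until it passes the check; redone steps may
                # themselves introduce new errors.
                while rng.random() < error_prob:
                    rework = fix_time + step_time + check_time
                    total += rework
                    fixing += rework
        else:  # "lazy"
            first_error = None
            for i in range(num_steps):
                total += step_time
                if first_error is None and rng.random() < error_prob:
                    first_error = i  # earliest undetected error
            total += check_time  # single end-of-process check
            if first_error is not None:
                # All work after the erroneous step is suspect and is redone
                # (simplification: the rework itself is assumed error-free).
                rework = fix_time + (num_steps - first_error) * step_time + check_time
                total += rework
                fixing += rework
        return total, fixing

    def average_times(strategy, runs=10_000):
        """Monte Carlo estimate of mean total and error-handling times."""
        results = [simulate(num_steps=10, step_time=5.0, check_time=1.0,
                            fix_time=8.0, error_prob=0.15, strategy=strategy,
                            seed=run)
                   for run in range(runs)]
        n = len(results)
        return (sum(t for t, _ in results) / n,
                sum(f for _, f in results) / n)

    if __name__ == "__main__":
        for strategy in ("eager", "lazy"):
            total, fixing = average_times(strategy)
            print(f"{strategy:>5}: mean total time {total:6.1f}, "
                  f"mean time on errors {fixing:5.1f}")

Running the sketch shows the trade-off in miniature: eager checking pays a fixed per-step overhead but keeps rework small and localized, while lazy checking saves checks but risks large end-of-process rework, which is the kind of comparison the paper's simulation framework explores at full scale.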
Citation:
Jie Chen, Xiwei Xu, Leon J. Osterweil, Liming Zhu, Yuriy Brun, Len Bass, Junchao Xiao, Mingshu Li, and Qing Wang, Using Simulation to Evaluate Error Detection Strategies: A Case Study of Cloud-Based Deployment Processes, Journal of Systems and Software, vol. 110, December 2015, pp. 205–221.
Bibtex:
@article{Chen15,
  author = {Jie Chen and Xiwei Xu and Leon J. Osterweil and Liming Zhu and
            Yuriy Brun and Len Bass and Junchao Xiao and Mingshu Li and Qing Wang},
  title = {\href{http://people.cs.umass.edu/brun/pubs/pubs/Chen15.pdf}{Using
           Simulation to Evaluate Error Detection Strategies: {A} Case Study of
           Cloud-Based Deployment Processes}},
  journal = {Journal of Systems and Software},
  venue = {JSS},
  year = {2015},
  volume = {110},
  pages = {205--221},
  month = {December},
  doi = {10.1016/j.jss.2015.08.043},
  note = {\href{https://doi.org/10.1016/j.jss.2015.08.043}{DOI:
          10.1016/j.jss.2015.08.043}},
  abstract = {The processes for deploying systems in cloud environments can
    serve as a basis for studying strategies for detecting and correcting
    errors committed during complex process execution. These cloud-based
    processes encompass diverse activities and entail complex interactions
    among cloud infrastructure, application software, tools, and humans. Many
    of these processes, such as those for making release decisions during
    continuous deployment and troubleshooting system upgrades, are highly
    error-prone. Unlike the deployed software systems themselves, which are
    typically well tested, these deployment processes are usually neither
    well understood nor well tested. Errors that occur during such processes
    may require time-consuming troubleshooting, undoing and redoing steps,
    and problem fixing. Consequently, these processes should ideally be
    guided by error-detection strategies that consider trade-offs between
    efficiency and reliability. This paper presents a framework for
    systematically exploring such trade-offs. To evaluate the framework and
    illustrate our approach, we use two representative cloud deployment
    processes: a continuous deployment process and a rolling upgrade process.
    We augment an existing process modeling language to represent these
    processes and to model errors that may occur during process execution. We
    use a process-aware discrete-event simulator to evaluate strategies and
    empirically validate the simulation results by comparing them to
    experiences in a production environment. Our evaluation demonstrates that
    our approach supports studying how error-handling strategies affect the
    time spent on task completion and error fixing.},
  fundedBy = {NSF IIS-1239334, NSF CNS-1258588, NSF IIS-0705772,
              Natural Science Foundation of China 91318301,
              Natural Science Foundation of China 91218302},
}