% Preventing Undesirable Behavior of Intelligent Machines
% Thomas et al., "Preventing Undesirable Behavior of Intelligent Machines",
% Science 366(6468), 2019.
%
% Maintenance notes:
%   - Publication date is split into year / month / day. `month` uses the
%     standard three-letter macro (unquoted) so styles can localise it;
%     the day of the month is kept in its own `day` field rather than being
%     glued into `month` or stored as a malformed `date`.
%   - `{da Silva}` is braced so the compound surname is kept as one unit.
%   - `venue` and `fundedBy` are non-standard fields; they are presumably
%     consumed by external tooling (TODO confirm) and are kept unchanged.
%   - `note` repeats the DOI; presumably so styles that do not print the
%     `doi` field still show it (TODO confirm before removing).
@article{Thomas19science,
  author   = {Thomas, Philip S. and {da Silva}, Bruno Castro and Barto, Andrew G. and Giguere, Stephen and Brun, Yuriy and Brunskill, Emma},
  title    = {Preventing Undesirable Behavior of Intelligent Machines},
  journal  = {Science},
  venue    = {Science},
  volume   = {366},
  number   = {6468},
  pages    = {999--1004},
  year     = {2019},
  month    = nov,
  day      = {22},
  issn     = {0036-8075},
  doi      = {10.1126/science.aag3311},
  note     = {DOI: 10.1126/science.aag3311},
  abstract = {Intelligent machines using machine learning algorithms are
ubiquitous, ranging from simple data analysis and pattern recognition tools
to complex systems that achieve super-human performance on various tasks.
Ensuring that they do not exhibit undesirable behavior — that they do
not, for example, cause harm to humans — is therefore a pressing
problem that we address here. We propose a general and flexible framework
for designing machine learning algorithms that simplifies the problem of
specifying and regulating undesirable behavior. To show the viability of
this framework, we use it to create machine learning algorithms that
preclude the dangerous behavior caused by standard machine learning
algorithms in our experiments. Our framework for designing machine learning
algorithms simplifies the safe and responsible application of machine
learning.},
  fundedBy = {NSF 1350984, NSF CCF-1453474, NSF CCF-1763423,
Institute of Educational Science grant R305A130215},
}