Industrial deployments of automated program repair (APR), e.g., at Facebook and Bloomberg, signal a new milestone for this exciting and potentially impactful technology. In these deployments, developers use APR-generated patch suggestions as part of a human-driven debugging process. Unfortunately, little is known about how using patch suggestions affects developers during debugging. This paper conducts a controlled user study with 40 developers with a median of 6 years of experience. The developers engage in debugging tasks on nine naturally-occurring defects in real-world, open-source, Java projects, using Recoder, SimFix, and TBar, three state-of-the-art APR tools. For each debugging task, the developers either have access to the project's tests, or, also, to code suggestions that make all the tests pass. These suggestions are either developer-written or APR-generated, which can be correct or deceptive. Deceptive suggestions, which are a common APR occurrence, make all the available tests pass but fail to generalize to the intended specification. Through a total of 160 debugging sessions, we find that access to a code suggestion significantly increases the odds of submitting a patch. Correct APR suggestions increase the odds of debugging success by 14,000%, but deceptive suggestions decrease the odds of success by 65%. Correct suggestions also speed up debugging. Surprisingly, we observe no significant difference in how novice and experienced developers are affected by APR, suggesting that APR may find uses across the experience spectrum. Overall, developers come away with a strong positive impression of APR, suggesting promise for APR-mediated, human-driven debugging, despite existing challenges in APR-generated repair quality.
@inproceedings{Eladawy24icse, author = {Hadeel Eladawy and Claire {Le Goues} and Yuriy Brun}, title = {\href{http://people.cs.umass.edu/brun/pubs/pubs/Eladawy24icse.pdf}{Automated Program Repair, What Is It Good For? {Not} Absolutely Nothing!}}, booktitle = {Proceedings of the 46th International Conference on Software Engineering (ICSE)}, venue = {ICSE}, address = {Lisbon, Portugal}, month = {April}, date = {14--20}, year = {2024}, pages = {1017--1029}, accept = {$\frac{234}{1,079} \approx 22\%$}, doi = {10.1145/3597503.3639095}, note = {ACM artifact badges granted: \href{https://www.acm.org/publications/policies/artifact-review-and-badging-current}{\raisebox{-.75ex}{\includegraphics[height=2.5ex]{ACMArtifactAvailable}}~Artifact Available, \raisebox{-.75ex}{\includegraphics[height=2.5ex]{ACMArtifactReusable}}~Artifact Reusable}. \href{https://doi.org/10.1145/3597503.3639095}{DOI: 10.1145/3597503.3639095}}, abstract = {<p>Industrial deployments of automated program repair (APR), e.g., at Facebook and Bloomberg, signal a new milestone for this exciting and potentially impactful technology. In these deployments, developers use APR-generated patch suggestions as part of a human-driven debugging process. Unfortunately, little is known about how using patch suggestions affects developers during debugging. This paper conducts a controlled user study with 40 developers with a median of 6 years of experience. The developers engage in debugging tasks on nine naturally-occurring defects in real-world, open-source, Java projects, using Recoder, SimFix, and TBar, three state-of-the-art APR tools. For each debugging task, the developers either have access to the project's tests, or, also, to code suggestions that make all the tests pass. These suggestions are either developer-written or APR-generated, which can be correct or deceptive. Deceptive suggestions, which are a common APR occurrence, make all the available tests pass but fail to generalize to the intended specification. Through a total of 160 debugging sessions, we find that access to a code suggestion significantly increases the odds of submitting a patch. Correct APR suggestions increase the odds of debugging success by 14,000%, but deceptive suggestions decrease the odds of success by 65%. Correct suggestions also speed up debugging. Surprisingly, we observe no significant difference in how novice and experienced developers are affected by APR, suggesting that APR may find uses across the experience spectrum. Overall, developers come away with a strong positive impression of APR, suggesting promise for APR-mediated, human-driven debugging, despite existing challenges in APR-generated repair quality.</p>}, fundedBy = {NSF CCF-1750116, NSF CCF-2210243}, }