Baldur: Whole-Proof Generation and Repair with Large Language Models"/> Baldur: Whole-Proof Generation and Repair with Large Language Models"/>
Formally verifying software properties is a highly desirable but labor-intensive task. Recent work has developed methods to automate formal verification using proof assistants, such as Coq and Isabelle/HOL, e.g., by training a model to predict one proof step at a time, and using that model to search through the space of possible proofs. This paper introduces a new method to automate formal verification: We use large language models, trained on natural language text and code and fine-tuned on proofs, to generate whole proofs for theorems at once, rather than one step at a time. We combine this proof generation model with a fine-tuned repair model to repair generated proofs, further increasing proving power. As its main contributions, this paper demonstrates for the first time that: (1) Whole-proof generation using transformers is possible and is as effective as search-based techniques without requiring costly search. (2) Giving the learned model additional context, such as a prior failed proof attempt and the ensuing error message, results in proof repair and further improves automated proof generation. (3) We establish a new state of the art for fully automated proof synthesis. We reify our method in a prototype, Baldur, and evaluate it on a benchmark of 6,336 Isabelle/HOL theorems and their proofs. In addition to empirically showing the effectiveness of whole-proof generation, repair, and added context, we show that Baldur improves on the state-of-the-art tool, Thor, by automatically generating proofs for an additional 8.7% of the theorems. Together, Baldur and Thor can prove 65.7% of the theorems fully automatically. This paper paves the way for new research into using large language models for automating formal verification.
@inproceedings{First23fse,
author = {Emily First and Markus Rabe and Talia Ringer and Yuriy Brun},
title = {Baldur: {Whole}-Proof Generation and Repair with Large Language Models},
booktitle = {Proceedings of the 30th ACM Joint European Software
Engineering Conference and Symposium on the Foundations of Software
Engineering (ESEC/FSE)},
pages = {1229--1241},
venue = {ESEC/FSE},
month = {December},
year = {2023},
date = {6--8},
address = {San Francisco, CA, USA},
doi = {10.1145/3611643.3616243},
note = {ACM SIGSOFT Distinguished Paper Award.
DOI:
10.1145/3611643.3616243, arXiv: abs/2303.04910},
comment = {<span class="emphasis">ACM SIGSOFT Distinguished Paper Award</span>},
accept = {$\frac{60}{473} \approx 13\%$ (direct accept, without revision)},
abstract = {Formally verifying software properties is a highly desirable
but labor-intensive task. Recent work has developed methods to automate
formal verification using proof assistants, such as Coq and Isabelle/HOL,
e.g., by training a model to predict one proof step at a time, and using
that model to search through the space of possible proofs. This paper
introduces a new method to automate formal verification: We use large
language models, trained on natural language text and code and fine-tuned
on proofs, to generate whole proofs for theorems at once, rather than one
step at a time. We combine this proof generation model with a fine-tuned
repair model to repair generated proofs, further increasing proving power.
As its main contributions, this paper demonstrates for the first time that:
(1) Whole-proof generation using transformers is possible and is as
effective as search-based techniques without requiring costly search. (2)
Giving the learned model additional context, such as a prior failed proof
attempt and the ensuing error message, results in proof repair and further
improves automated proof generation. (3) We establish a new state of the
art for fully automated proof synthesis. We reify our method in a
prototype, Baldur, and evaluate it on a benchmark of 6,336 Isabelle/HOL
theorems and their proofs. In addition to empirically showing the
effectiveness of whole-proof generation, repair, and added context, we show
that Baldur improves on the state-of-the-art tool, Thor, by automatically
generating proofs for an additional 8.7% of the theorems. Together, Baldur
and Thor can prove 65.7% of the theorems fully automatically. This paper
paves the way for new research into using large language models for
automating formal verification.},
fundedBy = {DARPA HR0011-22-9-0063, NSF CCF-2210243},
}